From ab69139ade39a176d1ea938a05163d1883eaee9f Mon Sep 17 00:00:00 2001
From: ImmanuelSegol <3ditds@gmail.com>
Date: Sun, 16 Jul 2023 07:31:41 -0400
Subject: [PATCH] Goicicle (#77)

---
 .gitignore | 3 +
 curve_parameters/bn254.json | 2 +-
 curve_parameters/new_curve_script.py | 2 +-
 goicicle/Makefile | 29 +
 goicicle/README.md | 49 +
 goicicle/curves/bls12377/g1.go | 505 +
 goicicle/curves/bls12377/g2.go | 109 +
 goicicle/curves/bls12377/msm.go | 90 +
 goicicle/curves/bls12377/ntt.go | 73 +
 goicicle/curves/bls12377/utils.go | 25 +
 goicicle/curves/bls12377/utils_test.go | 81 +
 goicicle/curves/bls12381/g1.go | 505 +
 goicicle/curves/bls12381/g2.go | 108 +
 goicicle/curves/bls12381/msm.go | 90 +
 goicicle/curves/bls12381/ntt.go | 73 +
 goicicle/curves/bls12381/utils.go | 25 +
 goicicle/curves/bls12381/utils_test.go | 81 +
 goicicle/curves/bn254/g1.go | 503 +
 goicicle/curves/bn254/g1_test.go | 229 +
 goicicle/curves/bn254/g2.go | 235 +
 goicicle/curves/bn254/g2_test.go | 18 +
 goicicle/curves/bn254/msm.go | 187 +
 goicicle/curves/bn254/msm_test.go | 391 +
 goicicle/curves/bn254/ntt.go | 202 +
 goicicle/curves/bn254/ntt_test.go | 219 +
 goicicle/curves/bn254/utils.go | 34 +
 goicicle/curves/bn254/utils_test.go | 81 +
 goicicle/curves/bn254/vec_mod.go | 41 +
 goicicle/go.mod | 20 +
 goicicle/go.sum | 25 +
 goicicle/goicicle.go | 58 +
 goicicle/templates/curves/curves.go | 37 +
 goicicle/templates/curves/g1.go.tmpl | 469 +
 goicicle/templates/curves/g2.go.tmpl | 83 +
 goicicle/templates/curves/imports.go.tmpl | 34 +
 goicicle/templates/hfiles/c_api.h.tmpl | 15 +
 goicicle/templates/hfiles/msm.h.tmpl | 35 +
 goicicle/templates/hfiles/ntt.h.tmpl | 27 +
 goicicle/templates/hfiles/ve_mod_mult.h.tmpl | 24 +
 goicicle/templates/main.go | 161 +
 goicicle/templates/msm/msm.go.tmpl | 71 +
 goicicle/templates/ntt/ntt.go.tmpl | 54 +
 icicle/CMakeLists.txt | 8 +-
 icicle/appUtils/msm/msm.cu | 1385 +-
 icicle/appUtils/msm/msm.cuh | 4 +-
 icicle/appUtils/msm/tests/msm_test.cu | 139 +-
 icicle/appUtils/ntt/lde.cu | 64 +-
 icicle/appUtils/ntt/ntt.cuh | 38 +-
 .../vector_manipulation/ve_mod_mult.cuh | 25 +
 icicle/curves/bls12_377/c_api.h | 33 +
 icicle/curves/bls12_377/curve_config.cuh | 3 +
 icicle/curves/bls12_377/lde.cu | 19 +-
 icicle/curves/bls12_377/msm.cu | 4 +-
 icicle/curves/bls12_377/msm.h | 53 +
 icicle/curves/bls12_377/ntt.h | 44 +
 icicle/curves/bls12_377/params.cuh | 2 +-
 icicle/curves/bls12_377/ve_mod_mult.h | 41 +
 icicle/curves/bls12_381/c_api.h | 32 +
 icicle/curves/bls12_381/curve_config.cuh | 3 +
 icicle/curves/bls12_381/lde.cu | 26 +-
 icicle/curves/bls12_381/msm.cu | 4 +-
 icicle/curves/bls12_381/msm.h | 53 +
 icicle/curves/bls12_381/ntt.h | 44 +
 icicle/curves/bls12_381/params.cuh | 3 +
 .../curves/bls12_381/supported_operations.cu | 2 +-
 icicle/curves/bls12_381/ve_mod_mult.h | 41 +
 icicle/curves/bn254/c_api.h | 34 +
 icicle/curves/bn254/cuda.h | 14752 ++++++++++++++++
 icicle/curves/bn254/cuda_runtime.h | 2039 +++
 icicle/curves/bn254/curve_config.cuh | 3 +
 icicle/curves/bn254/lde.cu | 238 +-
 icicle/curves/bn254/msm.cu | 190 +-
 icicle/curves/bn254/msm.h | 62 +
 icicle/curves/bn254/ntt.h | 68 +
 icicle/curves/bn254/params.cuh | 113 +-
 icicle/curves/bn254/projective.cu | 2 +-
 icicle/curves/bn254/ve_mod_mult.cu | 15 +
 icicle/curves/bn254/ve_mod_mult.h | 41 +
 icicle/curves/curve_template/lde.cu | 49 +-
 icicle/curves/curve_template/msm.cu | 4 +-
 icicle/curves/curve_template/projective.cu | 2 +-
 icicle/primitives/extension_field.cuh | 69 +-
 icicle/primitives/field.cuh | 511 +-
 icicle/primitives/projective.cuh | 131 +-
 icicle/primitives/test.cu | 331 +-
icicle/primitives/test_kernels.cuh | 112 +- icicle/utils/mont.cuh | 25 + src/test_bn254.rs | 64 +- 88 files changed, 25529 insertions(+), 499 deletions(-) create mode 100644 goicicle/Makefile create mode 100644 goicicle/README.md create mode 100644 goicicle/curves/bls12377/g1.go create mode 100644 goicicle/curves/bls12377/g2.go create mode 100644 goicicle/curves/bls12377/msm.go create mode 100644 goicicle/curves/bls12377/ntt.go create mode 100644 goicicle/curves/bls12377/utils.go create mode 100644 goicicle/curves/bls12377/utils_test.go create mode 100644 goicicle/curves/bls12381/g1.go create mode 100644 goicicle/curves/bls12381/g2.go create mode 100644 goicicle/curves/bls12381/msm.go create mode 100644 goicicle/curves/bls12381/ntt.go create mode 100644 goicicle/curves/bls12381/utils.go create mode 100644 goicicle/curves/bls12381/utils_test.go create mode 100644 goicicle/curves/bn254/g1.go create mode 100644 goicicle/curves/bn254/g1_test.go create mode 100644 goicicle/curves/bn254/g2.go create mode 100644 goicicle/curves/bn254/g2_test.go create mode 100644 goicicle/curves/bn254/msm.go create mode 100644 goicicle/curves/bn254/msm_test.go create mode 100644 goicicle/curves/bn254/ntt.go create mode 100644 goicicle/curves/bn254/ntt_test.go create mode 100644 goicicle/curves/bn254/utils.go create mode 100644 goicicle/curves/bn254/utils_test.go create mode 100644 goicicle/curves/bn254/vec_mod.go create mode 100644 goicicle/go.mod create mode 100644 goicicle/go.sum create mode 100644 goicicle/goicicle.go create mode 100644 goicicle/templates/curves/curves.go create mode 100644 goicicle/templates/curves/g1.go.tmpl create mode 100644 goicicle/templates/curves/g2.go.tmpl create mode 100644 goicicle/templates/curves/imports.go.tmpl create mode 100644 goicicle/templates/hfiles/c_api.h.tmpl create mode 100644 goicicle/templates/hfiles/msm.h.tmpl create mode 100644 goicicle/templates/hfiles/ntt.h.tmpl create mode 100644 goicicle/templates/hfiles/ve_mod_mult.h.tmpl create mode 100644 goicicle/templates/main.go create mode 100644 goicicle/templates/msm/msm.go.tmpl create mode 100644 goicicle/templates/ntt/ntt.go.tmpl create mode 100644 icicle/curves/bls12_377/c_api.h create mode 100644 icicle/curves/bls12_377/msm.h create mode 100644 icicle/curves/bls12_377/ntt.h create mode 100644 icicle/curves/bls12_377/ve_mod_mult.h create mode 100644 icicle/curves/bls12_381/c_api.h create mode 100644 icicle/curves/bls12_381/msm.h create mode 100644 icicle/curves/bls12_381/ntt.h create mode 100644 icicle/curves/bls12_381/ve_mod_mult.h create mode 100644 icicle/curves/bn254/c_api.h create mode 100644 icicle/curves/bn254/cuda.h create mode 100644 icicle/curves/bn254/cuda_runtime.h create mode 100644 icicle/curves/bn254/msm.h create mode 100644 icicle/curves/bn254/ntt.h create mode 100644 icicle/curves/bn254/ve_mod_mult.h create mode 100644 icicle/utils/mont.cuh diff --git a/.gitignore b/.gitignore index c8634e3e2..01989aace 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ *.cubin *.bin *.fatbin +*.so +*.nsys-rep +*.ncu-rep **/target **/.vscode **/.*lock*csv# diff --git a/curve_parameters/bn254.json b/curve_parameters/bn254.json index ecacfd71a..4fcaa16d0 100644 --- a/curve_parameters/bn254.json +++ b/curve_parameters/bn254.json @@ -7,7 +7,7 @@ "modulus_q" : 21888242871839275222246405745257275088696311157297823662689037894645226208583, "bit_count_q" : 254, "limb_q" : 8, - "root_of_unity" : 19103219067921713944291392827692070036145651957329286315305642004821462161904, + "root_of_unity": 
19103219067921713944291392827692070036145651957329286315305642004821462161904, "weierstrass_b" : 3, "weierstrass_b_g2_re" : 19485874751759354771024239261021720505790618469301721065564631296452457478373, "weierstrass_b_g2_im" : 266929791119991161246907387137283842545076965332900288569378510910307636690, diff --git a/curve_parameters/new_curve_script.py b/curve_parameters/new_curve_script.py index cbf206fe1..923caf2a2 100644 --- a/curve_parameters/new_curve_script.py +++ b/curve_parameters/new_curve_script.py @@ -313,4 +313,4 @@ def get_params(config): with open('./src/lib.rs', 'r+') as f: lib_text = f.read() if lib_text.find(curve_name_lower) == -1: - f.write('\npub mod ' + curve_name_lower + ';') \ No newline at end of file + f.write('\npub mod ' + curve_name_lower + ';') diff --git a/goicicle/Makefile b/goicicle/Makefile new file mode 100644 index 000000000..c8193dd3c --- /dev/null +++ b/goicicle/Makefile @@ -0,0 +1,29 @@ +CUDA_ROOT_DIR = /usr/local/cuda +NVCC = $(CUDA_ROOT_DIR)/bin/nvcc +CFLAGS = -Xcompiler -fPIC -std=c++17 +LDFLAGS = -shared +FEATURES = -DG2_DEFINED + +TARGET_BN254 = libbn254.so +TARGET_BLS12_381 = libbls12_381.so +TARGET_BLS12_377 = libbls12_377.so + +VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381 + +SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu +SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu +SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu + +all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) + +$(TARGET_BN254): + $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254)) -o $@ + +$(TARGET_BLS12_381): + $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381)) -o $@ + +$(TARGET_BLS12_377): + $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_377/, $(SRCS_BLS12_377)) -o $@ + +clean: + rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) \ No newline at end of file diff --git a/goicicle/README.md b/goicicle/README.md new file mode 100644 index 000000000..a67e1baa6 --- /dev/null +++ b/goicicle/README.md @@ -0,0 +1,49 @@ +# ICICLE CUDA to Golang Binding Guide + +This guide provides instructions on how to compile CUDA code using the provided Makefile, and then how to use the resulting shared libraries to bind Golang to ICICLE's CUDA code. + +## Prerequisites + +To compile the CUDA files, you will need: + +- CUDA toolkit installed. The Makefile assumes CUDA is installed in `/usr/local/cuda`. If CUDA is installed in a different location, please adjust the `CUDA_ROOT_DIR` variable accordingly. +- A compatible GPU and corresponding driver installed on your machine. + +## Structure of the Makefile + +The Makefile is designed to compile CUDA files for three curves: BN254, BLS12_381, and BLS12_377. The source files are located in the `icicle/curves/` directory. + +## Compiling CUDA Code + +1. Navigate to the directory containing the Makefile in your terminal. +2. To compile all curve libraries, use the `make all` command. This will create three shared libraries: `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so`. +3. If you want to compile a specific curve, you can do so by specifying the target. For example, to compile only the BN254 curve, use `make libbn254.so`. Replace `libbn254.so` with `libbls12_381.so` or `libbls12_377.so` to compile those curves instead. + +The resulting `.so` files are the compiled shared libraries for each curve. 
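+
+For reference, the generated Go packages under `goicicle/curves/` already carry cgo directives that point at these libraries using paths relative to the package directory. The bn254 package, for example, declares roughly the following preamble in `g1.go` (trimmed here to the relevant lines):
+
+```go
+// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bn254/
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
+// #include "c_api.h"
+// #include "ve_mod_mult.h"
+import "C"
+```
+
+Note that the dynamic linker must still be able to locate `libbn254.so` (and the other curve libraries) at runtime, for example by adding the directory that contains them to `LD_LIBRARY_PATH`.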
+ +## Golang Binding + +The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code. + +1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE. + +2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example on how you can use `cgo` to link these libraries: + +```go +/* +#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 +#include "icicle.h" // make sure you use the correct header file(s) +*/ +import "C" + +func main() { + // Now you can call the C functions from the ICICLE libraries. + // Note that C function calls are prefixed with 'C.' in Go code. +} +``` + +Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system. + +## Cleaning up + +If you want to remove the compiled files, you can use the `make clean` command. This will remove the `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so` files. diff --git a/goicicle/curves/bls12377/g1.go b/goicicle/curves/bls12377/g1.go new file mode 100644 index 000000000..45fe00ea7 --- /dev/null +++ b/goicicle/curves/bls12377/g1.go @@ -0,0 +1,505 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. 
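+
+// This file defines the Go-side mirrors of the BLS12-377 G1 types used by the
+// CUDA kernels: ScalarField and BaseField store field elements as fixed-size
+// uint32 limb arrays, and PointBLS12377 / PointAffineNoInfinityBLS12377 build
+// projective and affine points from them. The remaining helpers convert these
+// types to and from their gnark-crypto bls12-377 counterparts and forward
+// vector operations to the compiled shared library through cgo.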
+ +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + +import ( + "unsafe" + + "encoding/binary" + "fmt" + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377" + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" + + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12377/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = 8 +const BASE_SIZE = 12 + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * PointBLS12377 + */ + +type PointBLS12377 struct { + x, y, z BaseField +} + +func NewPointBLS12377Zero() *PointBLS12377 { + return &PointBLS12377{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: *NewFieldZero[BaseField](), + } +} + +func (p *PointBLS12377) eq(pCompare 
*PointBLS12377) bool { + // Cast *PointBLS12377 to *C.BLS12377_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12377_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12377_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_bls12377(pC, pCompareC)) +} + +func (p *PointBLS12377) strip_z() *PointAffineNoInfinityBLS12377 { + return &PointAffineNoInfinityBLS12377{ + x: p.x, + y: p.y, + } +} + +func (p *PointBLS12377) toGnarkAffine() *bls12377.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &bls12377.G1Affine{X: *x, Y: *y} +} + +func (p *PointBLS12377) ToGnarkJac() *bls12377.G1Jac { + var p1 bls12377.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func PointBLS12377FromG1AffineGnark(gnark *bls12377.G1Affine) *PointBLS12377 { + point := PointBLS12377{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func PointBLS12377FromJacGnark(gnark *bls12377.G1Jac) *PointBLS12377 { + var pointAffine bls12377.G1Affine + pointAffine.FromJacobian(gnark) + + point := PointBLS12377{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func PointBLS12377fromLimbs(x, y, z *[]uint32) *PointBLS12377 { + return &PointBLS12377{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinityBLS12377 + */ + +type PointAffineNoInfinityBLS12377 struct { + x, y BaseField +} + +func NewPointAffineNoInfinityBLS12377Zero() *PointAffineNoInfinityBLS12377 { + return &PointAffineNoInfinityBLS12377{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinityBLS12377) toProjective() *PointBLS12377 { + return &PointBLS12377{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinityBLS12377) toGnarkAffine() *bls12377.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func PointAffineNoInfinityBLS12377FromLimbs(x, y *[]uint32) *PointAffineNoInfinityBLS12377 { + return &PointAffineNoInfinityBLS12377{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []PointBLS12377, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.BLS12377_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_bls12377(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&b[0])) + 
deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_scalar_bls12377(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_bls12377(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 { + channels := make([]chan []T, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) 
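+ // Channels are drained in goroutine index order, so the converted
+ // elements come back in the same order as the input slice.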
+ } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []bls12377.G1Affine) []PointAffineNoInfinityBLS12377 { + var newElements []PointAffineNoInfinityBLS12377 + for _, e := range elements { + newElement := PointBLS12377FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/curves/bls12377/g2.go b/goicicle/curves/bls12377/g2.go new file mode 100644 index 000000000..7541f9899 --- /dev/null +++ b/goicicle/curves/bls12377/g2.go @@ -0,0 +1,109 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + +import ( + "unsafe" + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12377/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []bls12377.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *PointBLS12377 to *C.BLS12377_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12377_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12377_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. 
+ return bool(C.eq_g2_bls12377(pC, pCompareC)) +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *bls12377.G2Affine) *G2PointAffine { + // Bits() returns non montgomery format + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *bls12377.G2Jac) *G2PointAffine { + var pointAffine bls12377.G2Affine + pointAffine.FromJacobian(gnark) + + // Bits() returns non montgomery format + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/curves/bls12377/msm.go b/goicicle/curves/bls12377/msm.go new file mode 100644 index 000000000..4f476e13b --- /dev/null +++ b/goicicle/curves/bls12377/msm.go @@ -0,0 +1,90 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_377 +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "msm.h" +import "C" + +func MsmBLS12377(points []PointAffineNoInfinityBLS12377, scalars []ScalarField, device_id int) (*PointBLS12377, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + out := new(PointBLS12377) + + pointsC := (*C.BLS12377_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BLS12377_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_bls12_377 returned error code: %d", ret) + } + + return out, nil +} + +func MsmBatchBLS12377(points *[]PointAffineNoInfinityBLS12377, scalars *[]ScalarField, batchSize, deviceId int) ([]*PointBLS12377, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*PointBLS12377, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPointBLS12377Zero() + } + + outC := (*C.BLS12377_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BLS12377_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := 
C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret) + } + + return out, nil +} diff --git a/goicicle/curves/bls12377/ntt.go b/goicicle/curves/bls12377/ntt.go new file mode 100644 index 000000000..af10045b1 --- /dev/null +++ b/goicicle/curves/bls12377/ntt.go @@ -0,0 +1,73 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_377/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "ntt.h" +import "C" +import "unsafe" + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func NttBLS12377(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatchBLS12377(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBLS12377(values *[]PointBLS12377, isInverse bool, deviceId int) uint64 { + valuesC := (*C.BLS12377_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_bls12_377(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatchBLS12377(values *[]PointBLS12377, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.BLS12377_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_bls12_377(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} diff --git a/goicicle/curves/bls12377/utils.go b/goicicle/curves/bls12377/utils.go new file mode 100644 index 000000000..49cf4effe --- /dev/null +++ b/goicicle/curves/bls12377/utils.go @@ -0,0 +1,25 @@ +package bls12377 + +import "encoding/binary" + +// Function to convert [8]uint32 to [4]uint64 +func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 { + var arr64 [4]uint64 + for i := 0; i < len(arr32); i += 2 { + arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1]) + } + return arr64 +} + +func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 { + var arr32 [8]uint32 + for i, v := range arr64 { + b := make([]byte, 8) + 
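+ // Serialize the 64-bit limb as little-endian bytes, then split it into
+ // its low and high 32-bit words (low word stored at the even index).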
binary.LittleEndian.PutUint64(b, v) + + arr32[i*2] = binary.LittleEndian.Uint32(b[0:4]) + arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8]) + } + + return arr32 +} diff --git a/goicicle/curves/bls12377/utils_test.go b/goicicle/curves/bls12377/utils_test.go new file mode 100644 index 000000000..c18ddbf26 --- /dev/null +++ b/goicicle/curves/bls12377/utils_test.go @@ -0,0 +1,81 @@ +package bls12377 + +import ( + "testing" +) + +func TestConvertUint32ArrToUint64Arr(t *testing.T) { + testCases := []struct { + name string + input [8]uint32 + want [4]uint64 + }{ + { + name: "Test with incremental array", + input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}, + want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080}, + }, + { + name: "Test with all zeros", + input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}, + want: [4]uint64{0, 0, 0, 0}, + }, + { + name: "Test with maximum uint32 values", + input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, + want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615}, + }, + { + name: "Test with alternating min and max uint32 values", + input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295}, + want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295}, + }, + { + name: "Test with alternating max and min uint32 values", + input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0}, + want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint32ArrToUint64Arr(tc.input) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} + +func TestConvertUint64ArrToUint32Arr(t *testing.T) { + testCases := []struct { + name string + input [4]uint64 + expected [8]uint32 + }{ + { + name: "test one", + input: [4]uint64{1, 2, 3, 4}, + expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0}, + }, + { + name: "test two", + input: [4]uint64{100, 200, 300, 400}, + expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0}, + }, + { + name: "test three", + input: [4]uint64{1000, 2000, 3000, 4000}, + expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint64ArrToUint32Arr(tc.input) + if got != tc.expected { + t.Errorf("got %v, want %v", got, tc.expected) + } + }) + } +} diff --git a/goicicle/curves/bls12381/g1.go b/goicicle/curves/bls12381/g1.go new file mode 100644 index 000000000..f2a159655 --- /dev/null +++ b/goicicle/curves/bls12381/g1.go @@ -0,0 +1,505 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + +import ( + "unsafe" + + "encoding/binary" + "fmt" + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381" + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" + + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12381/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = 8 +const BASE_SIZE = 12 + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * PointBLS12381 + */ + +type PointBLS12381 struct { + x, y, z BaseField +} + +func NewPointBLS12381Zero() *PointBLS12381 { + return &PointBLS12381{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: *NewFieldZero[BaseField](), + } +} + +func (p *PointBLS12381) eq(pCompare 
*PointBLS12381) bool { + // Cast *PointBLS12381 to *C.BLS12381_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12381_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12381_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_bls12381(pC, pCompareC)) +} + +func (p *PointBLS12381) strip_z() *PointAffineNoInfinityBLS12381 { + return &PointAffineNoInfinityBLS12381{ + x: p.x, + y: p.y, + } +} + +func (p *PointBLS12381) toGnarkAffine() *bls12381.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &bls12381.G1Affine{X: *x, Y: *y} +} + +func (p *PointBLS12381) ToGnarkJac() *bls12381.G1Jac { + var p1 bls12381.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func PointBLS12381FromG1AffineGnark(gnark *bls12381.G1Affine) *PointBLS12381 { + point := PointBLS12381{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func PointBLS12381FromJacGnark(gnark *bls12381.G1Jac) *PointBLS12381 { + var pointAffine bls12381.G1Affine + pointAffine.FromJacobian(gnark) + + point := PointBLS12381{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func PointBLS12381fromLimbs(x, y, z *[]uint32) *PointBLS12381 { + return &PointBLS12381{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinityBLS12381 + */ + +type PointAffineNoInfinityBLS12381 struct { + x, y BaseField +} + +func NewPointAffineNoInfinityBLS12381Zero() *PointAffineNoInfinityBLS12381 { + return &PointAffineNoInfinityBLS12381{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinityBLS12381) toProjective() *PointBLS12381 { + return &PointBLS12381{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinityBLS12381) toGnarkAffine() *bls12381.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func PointAffineNoInfinityBLS12381FromLimbs(x, y *[]uint32) *PointAffineNoInfinityBLS12381 { + return &PointAffineNoInfinityBLS12381{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []PointBLS12381, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.BLS12381_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_bls12381(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&b[0])) + 
deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_scalar_bls12381(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_bls12381(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 { + channels := make([]chan []T, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) 
+ } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []bls12381.G1Affine) []PointAffineNoInfinityBLS12381 { + var newElements []PointAffineNoInfinityBLS12381 + for _, e := range elements { + newElement := PointBLS12381FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/curves/bls12381/g2.go b/goicicle/curves/bls12381/g2.go new file mode 100644 index 000000000..1b15713f2 --- /dev/null +++ b/goicicle/curves/bls12381/g2.go @@ -0,0 +1,108 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + +import ( + "unsafe" + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12381/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []bls12381.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *PointBLS12381 to *C.BLS12381_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12381_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12381_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. 
+ return bool(C.eq_g2_bls12381(pC, pCompareC)) +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *bls12381.G2Affine) *G2PointAffine { + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *bls12381.G2Jac) *G2PointAffine { + var pointAffine bls12381.G2Affine + pointAffine.FromJacobian(gnark) + + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/curves/bls12381/msm.go b/goicicle/curves/bls12381/msm.go new file mode 100644 index 000000000..da98ecb79 --- /dev/null +++ b/goicicle/curves/bls12381/msm.go @@ -0,0 +1,90 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_381/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "msm.h" +import "C" + +func MsmBLS12381(points []PointAffineNoInfinityBLS12381, scalars []ScalarField, device_id int) (*PointBLS12381, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + out := new(PointBLS12381) + + pointsC := (*C.BLS12381_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BLS12381_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_cuda_bls12_381(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_bls12_381 returned error code: %d", ret) + } + + return out, nil +} + +func MsmBatchBLS12381(points *[]PointAffineNoInfinityBLS12381, scalars *[]ScalarField, batchSize, deviceId int) ([]*PointBLS12381, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*PointBLS12381, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPointBLS12381Zero() + } + + outC := (*C.BLS12381_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BLS12381_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := 
C.size_t(batchSize) + + ret := C.msm_batch_cuda_bls12_381(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bls12_381 returned error code: %d", ret) + } + + return out, nil +} diff --git a/goicicle/curves/bls12381/ntt.go b/goicicle/curves/bls12381/ntt.go new file mode 100644 index 000000000..ab3107cf8 --- /dev/null +++ b/goicicle/curves/bls12381/ntt.go @@ -0,0 +1,73 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_381 +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "ntt.h" +import "C" +import "unsafe" + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func NttBLS12381(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatchBLS12381(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBLS12381(values *[]PointBLS12381, isInverse bool, deviceId int) uint64 { + valuesC := (*C.BLS12381_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_bls12_381(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatchBLS12381(values *[]PointBLS12381, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.BLS12381_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_bls12_381(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} diff --git a/goicicle/curves/bls12381/utils.go b/goicicle/curves/bls12381/utils.go new file mode 100644 index 000000000..d24af1af8 --- /dev/null +++ b/goicicle/curves/bls12381/utils.go @@ -0,0 +1,25 @@ +package bls12381 + +import "encoding/binary" + +// Function to convert [8]uint32 to [4]uint64 +func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 { + var arr64 [4]uint64 + for i := 0; i < len(arr32); i += 2 { + arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1]) + } + return arr64 +} + +func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 { + var arr32 [8]uint32 + for i, v := range arr64 { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, v) + + arr32[i*2] = binary.LittleEndian.Uint32(b[0:4]) 
+ arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8]) + } + + return arr32 +} diff --git a/goicicle/curves/bls12381/utils_test.go b/goicicle/curves/bls12381/utils_test.go new file mode 100644 index 000000000..762405800 --- /dev/null +++ b/goicicle/curves/bls12381/utils_test.go @@ -0,0 +1,81 @@ +package bls12381 + +import ( + "testing" +) + +func TestConvertUint32ArrToUint64Arr(t *testing.T) { + testCases := []struct { + name string + input [8]uint32 + want [4]uint64 + }{ + { + name: "Test with incremental array", + input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}, + want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080}, + }, + { + name: "Test with all zeros", + input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}, + want: [4]uint64{0, 0, 0, 0}, + }, + { + name: "Test with maximum uint32 values", + input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, + want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615}, + }, + { + name: "Test with alternating min and max uint32 values", + input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295}, + want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295}, + }, + { + name: "Test with alternating max and min uint32 values", + input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0}, + want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint32ArrToUint64Arr(tc.input) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} + +func TestConvertUint64ArrToUint32Arr(t *testing.T) { + testCases := []struct { + name string + input [4]uint64 + expected [8]uint32 + }{ + { + name: "test one", + input: [4]uint64{1, 2, 3, 4}, + expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0}, + }, + { + name: "test two", + input: [4]uint64{100, 200, 300, 400}, + expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0}, + }, + { + name: "test three", + input: [4]uint64{1000, 2000, 3000, 4000}, + expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint64ArrToUint32Arr(tc.input) + if got != tc.expected { + t.Errorf("got %v, want %v", got, tc.expected) + } + }) + } +} diff --git a/goicicle/curves/bn254/g1.go b/goicicle/curves/bn254/g1.go new file mode 100644 index 000000000..b5c560db1 --- /dev/null +++ b/goicicle/curves/bn254/g1.go @@ -0,0 +1,503 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
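+
+// Illustrative usage sketch (added documentation, not part of the generated
+// bindings): converting gnark-crypto values into the limb representation the
+// CUDA kernels consume, using helpers defined in this file. bn254 below refers
+// to gnark-crypto's bn254 package, as imported by this file.
+//
+//	var s fr.Element
+//	s.SetUint64(42)
+//
+//	// gnark fr.Element -> ICICLE ScalarField (non-Montgomery limbs)
+//	scalar := NewFieldFromFrGnark[ScalarField](s)
+//
+//	// gnark G1 affine generator -> ICICLE projective point (Z = 1)
+//	_, _, g1Aff, _ := bn254.Generators()
+//	point := PointBN254FromG1AffineGnark(&g1Aff)
+//
+//	// ...and back to a gnark Jacobian point.
+//	jac := point.ToGnarkJac()
+//	_, _ = scalar, jac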
+ +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +import ( + "unsafe" + + "encoding/binary" + "fmt" + + "github.com/consensys/gnark-crypto/ecc/bn254" + + "github.com/consensys/gnark-crypto/ecc/bn254/fp" + + "github.com/consensys/gnark-crypto/ecc/bn254/fr" +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = 8 +const BASE_SIZE = 8 + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +func (a *ScalarField) Equals(b *ScalarField) bool { + for i, v := range a.s { + if b.s[i] != v { + return false + } + } + return true +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * PointBN254 + */ + +type PointBN254 struct { + x, y, z BaseField +} + +func NewPointBN254Zero() *PointBN254 { + return &PointBN254{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: 
*NewFieldZero[BaseField](), + } +} + +func (p *PointBN254) eq(pCompare *PointBN254) bool { + // Cast *PointBN254 to *C.BN254_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BN254_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BN254_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_bn254(pC, pCompareC)) +} + +func (p *PointBN254) strip_z() *PointAffineNoInfinityBN254 { + return &PointAffineNoInfinityBN254{ + x: p.x, + y: p.y, + } +} + +func (p *PointBN254) toGnarkAffine() *bn254.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &bn254.G1Affine{X: *x, Y: *y} +} + +func (p *PointBN254) ToGnarkJac() *bn254.G1Jac { + var p1 bn254.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func PointBN254FromG1AffineGnark(gnark *bn254.G1Affine) *PointBN254 { + point := PointBN254{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func PointBN254FromJacGnark(gnark *bn254.G1Jac) *PointBN254 { + var pointAffine bn254.G1Affine + pointAffine.FromJacobian(gnark) + + point := PointBN254{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func PointBN254fromLimbs(x, y, z *[]uint32) *PointBN254 { + return &PointBN254{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinityBN254 + */ + +type PointAffineNoInfinityBN254 struct { + x, y BaseField +} + +func NewPointAffineNoInfinityBN254Zero() *PointAffineNoInfinityBN254 { + return &PointAffineNoInfinityBN254{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinityBN254) toProjective() *PointBN254 { + return &PointBN254{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinityBN254) toGnarkAffine() *bn254.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func PointAffineNoInfinityBN254FromLimbs(x, y *[]uint32) *PointAffineNoInfinityBN254 { + return &PointAffineNoInfinityBN254{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []PointBN254, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.BN254_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_bn254(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + 
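+	// vec_mod_mult_scalar_bn254 (declared in ve_mod_mult.h) is expected to multiply
+	// a[i] by b[i] modulo the BN254 scalar field, presumably writing the result back
+	// into a in place on the device identified by deviceID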
C.vec_mod_mult_scalar_bn254(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.BN254_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_bn254(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 && routines <= len(elements) { + channels := make([]chan []T, routines) + for i := 0; i < routines; i++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements) / routines + for i := 0; i < routines; i++ { + start := batchLen * i + end := batchLen * (i + 1) + elemsToConv := elements[start:end] + if i == routines-1 { + elemsToConv = elements[start:] + } + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements) / routines + for i := 0; i < routines; i++ { + elemsToConv := elements[batchLen*i : batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i++ { + newElements = append(newElements, <-channels[i]...) 
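+			// channels are drained in index order, so the converted output keeps the
+			// same ordering as the input slice; note that, unlike
+			// BatchConvertFromFrGnarkThreaded, the last batch here does not absorb a
+			// remainder, so len(elements) is assumed to be divisible by routines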
+ } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []bn254.G1Affine) []PointAffineNoInfinityBN254 { + var newElements []PointAffineNoInfinityBN254 + for _, e := range elements { + newElement := PointBN254FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/curves/bn254/g1_test.go b/goicicle/curves/bn254/g1_test.go new file mode 100644 index 000000000..ba416abbd --- /dev/null +++ b/goicicle/curves/bn254/g1_test.go @@ -0,0 +1,229 @@ +package bn254 + +import ( + "encoding/binary" + "fmt" + "testing" + + "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/gnark-crypto/ecc/bn254/fp" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/stretchr/testify/assert" +) + +func TestNewFieldBN254One(t *testing.T) { + oneField := NewBaseFieldOne() + rawOneField := [8]uint32([8]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) + + assert.Equal(t, oneField.s, rawOneField) +} + +func TestNewFieldBN254Zero(t *testing.T) { + zeroField := NewFieldZero[BaseField]() + rawZeroField := [8]uint32([8]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) + + assert.Equal(t, zeroField.s, rawZeroField) +} + +func TestFieldBN254FromGnark(t *testing.T) { + var rand fr.Element + rand.SetRandom() + + f := NewFieldFromFrGnark[ScalarField](rand) + + assert.Equal(t, f.s, ConvertUint64ArrToUint32Arr(rand.Bits())) +} + +func BenchmarkBatchConvertFromFrGnarkThreaded(b *testing.B) { + // ROUTINES := []int{4,5,6,7,8} + + // for _, routineAmount := range ROUTINES { + routineAmount := 7 + _, scalars_fr := GenerateScalars(1 << 24) + b.Run(fmt.Sprintf("Convert %d", routineAmount), func(b *testing.B) { + for n := 0; n < b.N; n++ { + _ = BatchConvertFromFrGnarkThreaded[ScalarField](scalars_fr, routineAmount) + } + }) + // } +} + +func BenchmarkBatchConvertFromFrGnark(b *testing.B) { + _, scalars_fr := GenerateScalars(1 << 24) + b.Run("BatchConvert 2^24", func(b *testing.B) { + for n := 0; n < b.N; n++ { + _ = BatchConvertFromFrGnark[ScalarField](scalars_fr) + } + }) +} + +func TestFieldBN254ToBytesLe(t *testing.T) { + var rand fr.Element + rand.SetRandom() + + f := NewFieldFromFrGnark[ScalarField](rand) + + expected := make([]byte, len(f.s)*4) // each uint32 takes 4 bytes + for i, v := range f.s { + binary.LittleEndian.PutUint32(expected[i*4:], v) + } + + assert.Equal(t, f.toBytesLe(), expected) + assert.Equal(t, len(f.toBytesLe()), 32) +} + +func TestNewPointBN254Zero(t *testing.T) { + point := NewPointBN254Zero() + a := new(PointBN254) + a.ToGnarkJac() + + assert.Equal(t, point.x, *NewFieldZero[BaseField]()) + assert.Equal(t, point.y, *NewBaseFieldOne()) + assert.Equal(t, point.z, *NewFieldZero[BaseField]()) +} + +func TestBN254Eq(t *testing.T) { + p1 := NewPointBN254Zero() + p2 := NewPointBN254Zero() + p3 := &PointBN254{ + x: *NewBaseFieldOne(), + y: *NewBaseFieldOne(), + z: *NewBaseFieldOne(), + } + + assert.Equal(t, p1.eq(p2), true) + assert.Equal(t, p1.eq(p3), false) +} + +func TestBN254StripZ(t *testing.T) { + p1 := NewPointBN254Zero() + p2ZLess := p1.strip_z() + + assert.IsType(t, PointAffineNoInfinityBN254{}, *p2ZLess) + assert.Equal(t, p1.x, p2ZLess.x) + assert.Equal(t, p1.y, p2ZLess.y) +} + +func TestPointBN254FromGnark(t *testing.T) { + gnarkP, _ := randG1Jac() + + p := PointBN254FromJacGnark(&gnarkP) + + z_inv := new(fp.Element) + z_invsq := new(fp.Element) + z_invq3 := 
new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + z_inv.Inverse(&gnarkP.Z) + z_invsq.Mul(z_inv, z_inv) + z_invq3.Mul(z_invsq, z_inv) + + x.Mul(&gnarkP.X, z_invsq) + y.Mul(&gnarkP.Y, z_invq3) + + assert.Equal(t, p.x, *NewFieldFromFpGnark[BaseField](*x)) + assert.Equal(t, p.y, *NewFieldFromFpGnark[BaseField](*y)) + assert.Equal(t, p.z, *NewBaseFieldOne()) +} + +func TestPointBN254fromLimbs(t *testing.T) { + gnarkP, _ := randG1Jac() + p := PointBN254FromJacGnark(&gnarkP) + + x := p.x.limbs() + y := p.y.limbs() + z := p.z.limbs() + + xSlice := x[:] + ySlice := y[:] + zSlice := z[:] + + pFromLimbs := PointBN254fromLimbs(&xSlice, &ySlice, &zSlice) + + assert.Equal(t, pFromLimbs, p) +} + +func TestNewPointAffineNoInfinityBN254Zero(t *testing.T) { + zeroP := NewPointAffineNoInfinityBN254Zero() + + assert.Equal(t, zeroP.x, *NewFieldZero[BaseField]()) + assert.Equal(t, zeroP.y, *NewFieldZero[BaseField]()) +} + +func TestPointAffineNoInfinityBN254ToProjective(t *testing.T) { + gnarkP, _ := randG1Jac() + affine := PointBN254FromJacGnark(&gnarkP).strip_z() + proj := affine.toProjective() + + assert.Equal(t, proj.x, affine.x) + assert.Equal(t, proj.x, affine.x) + assert.Equal(t, proj.z, *NewBaseFieldOne()) +} + +func TestPointAffineNoInfinityBN254FromLimbs(t *testing.T) { + // Initialize your test values + x := []uint32{1, 2, 3, 4, 5, 6, 7, 8} + y := []uint32{9, 10, 11, 12, 13, 14, 15, 16} + + // Execute your function + result := PointAffineNoInfinityBN254FromLimbs(&x, &y) + + // Define your expected result + expected := &PointAffineNoInfinityBN254{ + x: *BaseFieldFromLimbs(getFixedLimbs(&x)), + y: *BaseFieldFromLimbs(getFixedLimbs(&y)), + } + + // Test if result is as expected + assert.Equal(t, result, expected) +} + +func TestToGnarkAffine(t *testing.T) { + gJac, _ := randG1Jac() + proj := PointBN254FromJacGnark(&gJac) + + var gAffine bn254.G1Affine + gAffine.FromJacobian(&gJac) + + affine := *proj.toGnarkAffine() + assert.Equal(t, affine, gAffine) +} + +func TestGetFixedLimbs(t *testing.T) { + t.Run("case of valid input of length less than 8", func(t *testing.T) { + slice := []uint32{1, 2, 3, 4, 5, 6, 7} + expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0} + + result := getFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of valid input of length 8", func(t *testing.T) { + slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8} + expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8} + + result := getFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of empty input", func(t *testing.T) { + slice := []uint32{} + expected := [8]uint32{0, 0, 0, 0, 0, 0, 0, 0} + + result := getFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of input length greater than 8", func(t *testing.T) { + slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9} + + defer func() { + if r := recover(); r == nil { + t.Errorf("the code did not panic") + } + }() + + getFixedLimbs(&slice) + }) +} diff --git a/goicicle/curves/bn254/g2.go b/goicicle/curves/bn254/g2.go new file mode 100644 index 000000000..63a32d5a0 --- /dev/null +++ b/goicicle/curves/bn254/g2.go @@ -0,0 +1,235 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +import ( + "encoding/binary" + "errors" + "fmt" + "unsafe" + + "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/gnark-crypto/ecc/bn254/fp" +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []bn254.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +func BatchConvertFromG2AffineThreads(elements []bn254.G2Affine, routines int) []G2PointAffine { + var newElements []G2PointAffine + + if routines > 1 && routines <= len(elements) { + channels := make([]chan []G2PointAffine, routines) + for i := 0; i < routines; i++ { + channels[i] = make(chan []G2PointAffine, 1) + } + + convert := func(elements []bn254.G2Affine, chanIndex int) { + var convertedElements []G2PointAffine + for _, e := range elements { + var converted G2PointAffine + converted.FromGnarkAffine(&e) + convertedElements = append(convertedElements, converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements) / routines + for i := 0; i < routines; i++ { + start := batchLen * i + end := batchLen * (i + 1) + elemsToConv := elements[start:end] + if i == routines-1 { + elemsToConv = elements[start:] + } + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + var converted G2PointAffine + converted.FromGnarkAffine(&e) + newElements = append(newElements, converted) + } + } + + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *PointBN254 to *C.BN254_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BN254_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BN254_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_g2_bn254(pC, pCompareC)) +} + +func (f *G2Element) toBytesLe() []byte { + var bytes []byte + for _, val := range f { + buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit + binary.LittleEndian.PutUint64(buf, val) + bytes = append(bytes, buf...) 
+ } + return bytes +} + +/* +TODO: the following functions are due to a bug in the cuda code, +these fucntions should be deleted once cuda MsmG2 returns non montgomery format +*/ +const ( + q0 uint64 = 4332616871279656263 + q1 uint64 = 10917124144477883021 + q2 uint64 = 13281191951274694749 + q3 uint64 = 3486998266802970665 +) + +func smallerThanModulus(z fp.Element) bool { + return (z[3] < q3 || (z[3] == q3 && (z[2] < q2 || (z[2] == q2 && (z[1] < q1 || (z[1] == q1 && (z[0] < q0))))))) +} + +func ElementWithOutConvertingToMontgomery(b *[32]byte) (fp.Element, error) { + var z fp.Element + z[0] = binary.LittleEndian.Uint64((*b)[0:8]) + z[1] = binary.LittleEndian.Uint64((*b)[8:16]) + z[2] = binary.LittleEndian.Uint64((*b)[16:24]) + z[3] = binary.LittleEndian.Uint64((*b)[24:32]) + + if !smallerThanModulus(z) { + return fp.Element{}, errors.New("invalid fp.Element encoding") + } + + return z, nil +} + +func (f *G2Element) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := ElementWithOutConvertingToMontgomery(&b32) // cuda returns montgomery format + //v2, e := fp.LittleEndian.Element(&b32) // TODO: revert back to this once cuda code is fixed. + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ExtentionField) toGnarkE2() bn254.E2 { + return bn254.E2{ + A0: *f.A0.toGnarkFp(), + A1: *f.A1.toGnarkFp(), + } +} + +func (p *G2Point) ToGnarkJac() *bn254.G2Jac { + x := p.x.toGnarkE2() + y := p.y.toGnarkE2() + z := p.z.toGnarkE2() + + var zSquared bn254.E2 + zSquared.Mul(&z, &z) + + var X bn254.E2 + X.Mul(&x, &z) + + var Y bn254.E2 + Y.Mul(&y, &zSquared) + + after := bn254.G2Jac{ + X: X, + Y: Y, + Z: z, + } + + return &after +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *bn254.G2Affine) *G2PointAffine { + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *bn254.G2Jac) *G2PointAffine { + var pointAffine bn254.G2Affine + pointAffine.FromJacobian(gnark) + + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/curves/bn254/g2_test.go b/goicicle/curves/bn254/g2_test.go new file mode 100644 index 000000000..1d8233ec1 --- /dev/null +++ b/goicicle/curves/bn254/g2_test.go @@ -0,0 +1,18 @@ +package bn254 + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestToGnarkJacG2(t *testing.T) { + gnark, _ := randG2Jac() + + var pointAffine G2PointAffine + pointAffine.FromGnarkJac(&gnark) + pointProjective := pointAffine.ToProjective() + backToGnark := pointProjective.ToGnarkJac() + + assert.True(t, gnark.Equal(backToGnark)) +} diff --git a/goicicle/curves/bn254/msm.go b/goicicle/curves/bn254/msm.go new file mode 100644 index 000000000..d7a88d81e --- /dev/null +++ b/goicicle/curves/bn254/msm.go @@ -0,0 +1,187 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "msm.h" +import "C" + +func MsmBN254(out *PointBN254, points []PointAffineNoInfinityBN254, scalars []ScalarField, device_id int) (*PointBN254, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BN254_projective_t)(unsafe.Pointer(out)) + ret := C.msm_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func MsmG2BatchBN254(points *[]G2PointAffine, scalars *[]ScalarField, batchSize, deviceId int) ([]*G2Point, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*G2Point, batchSize) + + outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_g2_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func MsmG2BN254(out *G2Point, points []G2PointAffine, scalars []ScalarField, device_id int) (*G2Point, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_g2_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_g2_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count int) int { + d_outC := (*C.BN254_g2_projective_t)(d_out) + scalarsC := (*C.BN254_scalar_t)(d_scalars) + pointsC := (*C.BN254_g2_affine_t)(d_points) + countC := (C.size_t)(count) + + ret := C.commit_g2_cuda_bn254(d_outC, scalarsC, pointsC, countC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} + +func MsmBatchBN254(points *[]PointAffineNoInfinityBN254, scalars 
*[]ScalarField, batchSize, deviceId int) ([]*PointBN254, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*PointBN254, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPointBN254Zero() + } + + outC := (*C.BN254_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func Commit(d_out, d_scalars, d_points unsafe.Pointer, count int) int { + d_outC := (*C.BN254_projective_t)(d_out) + scalarsC := (*C.BN254_scalar_t)(d_scalars) + pointsC := (*C.BN254_affine_t)(d_points) + countC := (C.size_t)(count) + + ret := C.commit_cuda_bn254(d_outC, scalarsC, pointsC, countC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} + +func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int { + d_outC := (*C.BN254_projective_t)(d_out) + scalarsC := (*C.BN254_scalar_t)(d_scalars) + pointsC := (*C.BN254_affine_t)(d_points) + countC := (C.size_t)(count) + batch_sizeC := (C.size_t)(batch_size) + + ret := C.commit_batch_cuda_bn254(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} diff --git a/goicicle/curves/bn254/msm_test.go b/goicicle/curves/bn254/msm_test.go new file mode 100644 index 000000000..73d636ae0 --- /dev/null +++ b/goicicle/curves/bn254/msm_test.go @@ -0,0 +1,391 @@ +package bn254 + +import ( + "fmt" + "math" + "math/big" + "testing" + "time" + "unsafe" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/ingonyama-zk/icicle/goicicle" + "github.com/stretchr/testify/assert" +) + +func randG1Jac() (bn254.G1Jac, error) { + var point bn254.G1Jac + var scalar fr.Element + + _, err := scalar.SetRandom() + if err != nil { + return point, err + } + + genG1Jac, _, _, _ := bn254.Generators() + + //randomBigInt, err := rand.Int(rand.Reader, new(big.Int).Lsh(big.NewInt(1), 63)) + //randomBigInt, err := rand.Int(rand.Reader, big.NewInt(100)) + randomBigInt := big.NewInt(100) + + point.ScalarMultiplication(&genG1Jac, scalar.BigInt(randomBigInt)) + return point, nil +} + +func GeneratePoints(count int) ([]PointAffineNoInfinityBN254, []bn254.G1Affine) { + // Declare a slice of integers + var points []PointAffineNoInfinityBN254 + var pointsAffine []bn254.G1Affine + + // populate the slice + for i := 0; i < 10; i++ { + gnarkP, _ := randG1Jac() + var pointAffine bn254.G1Affine + pointAffine.FromJacobian(&gnarkP) + + p := PointBN254FromJacGnark(&gnarkP).strip_z() + + pointsAffine = append(pointsAffine, pointAffine) + points = append(points, *p) + } + + log2_10 := math.Log2(10) + log2Count := math.Log2(float64(count)) + 
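+	// only 10 random base points are sampled above; the slices are doubled
+	// ceil(log2(count/10)) times below so that at least count points exist
+	// before truncating to points[:count]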
log2Size := int(math.Ceil(log2Count - log2_10)) + + for i := 0; i < log2Size; i++ { + pointsAffine = append(pointsAffine, pointsAffine...) + points = append(points, points...) + } + + return points[:count], pointsAffine[:count] +} + +func GeneratePointsProj(count int) ([]PointBN254, []bn254.G1Jac) { + // Declare a slice of integers + var points []PointBN254 + var pointsAffine []bn254.G1Jac + + // Use a loop to populate the slice + for i := 0; i < count; i++ { + gnarkP, _ := randG1Jac() + p := PointBN254FromJacGnark(&gnarkP) + + pointsAffine = append(pointsAffine, gnarkP) + points = append(points, *p) + } + + return points, pointsAffine +} + +func GenerateScalars(count int) ([]ScalarField, []fr.Element) { + // Declare a slice of integers + var scalars []ScalarField + var scalars_fr []fr.Element + + var rand fr.Element + for i := 0; i < count; i++ { + rand.SetRandom() + s := NewFieldFromFrGnark[ScalarField](rand) + + scalars_fr = append(scalars_fr, rand) + scalars = append(scalars, *s) + } + + return scalars[:count], scalars_fr[:count] +} + +func TestMSM(t *testing.T) { + for _, v := range []int{24} { + count := 1 << v + + points, gnarkPoints := GeneratePoints(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + fmt.Print("Finished generating scalars\n") + + out := new(PointBN254) + startTime := time.Now() + _, e := MsmBN254(out, points, scalars, 0) // non mont + fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds()) + + assert.Equal(t, e, nil, "error should be nil") + fmt.Print("Finished icicle MSM\n") + + var bn254AffineLib bn254.G1Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + fmt.Print("Finished Gnark MSM\n") + + assert.Equal(t, out.toGnarkAffine(), gResult) + } +} + +func TestCommitMSM(t *testing.T) { + for _, _ = range []int{24} { + count := 12_180_757 + // count := 1 << v - 1 + + points, gnarkPoints := GeneratePoints(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + fmt.Print("Finished generating scalars\n") + + out_d, _ := goicicle.CudaMalloc(96) + + pointsBytes := count * 64 + points_d, _ := goicicle.CudaMalloc(pointsBytes) + goicicle.CudaMemCpyHtoD[PointAffineNoInfinityBN254](points_d, points, pointsBytes) + + scalarBytes := count * 32 + scalars_d, _ := goicicle.CudaMalloc(scalarBytes) + goicicle.CudaMemCpyHtoD[ScalarField](scalars_d, scalars, scalarBytes) + + startTime := time.Now() + e := Commit(out_d, scalars_d, points_d, count) + fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds()) + + outHost := make([]PointBN254, 1) + goicicle.CudaMemCpyDtoH[PointBN254](outHost, out_d, 96) + + assert.Equal(t, e, 0, "error should be 0") + fmt.Print("Finished icicle MSM\n") + + var bn254AffineLib bn254.G1Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + fmt.Print("Finished Gnark MSM\n") + + assert.Equal(t, outHost[0].toGnarkAffine(), gResult) + } +} + +func BenchmarkCommit(b *testing.B) { + LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26} + + for _, logMsmSize := range LOG_MSM_SIZES { + msmSize := 1 << logMsmSize + points, _ := GeneratePoints(msmSize) + scalars, _ := GenerateScalars(msmSize) + + out_d, _ := goicicle.CudaMalloc(96) + + pointsBytes := msmSize * 64 + points_d, _ := goicicle.CudaMalloc(pointsBytes) + goicicle.CudaMemCpyHtoD[PointAffineNoInfinityBN254](points_d, points, pointsBytes) + + scalarBytes := msmSize * 32 + scalars_d, _ := 
goicicle.CudaMalloc(scalarBytes) + goicicle.CudaMemCpyHtoD[ScalarField](scalars_d, scalars, scalarBytes) + + b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) { + for n := 0; n < b.N; n++ { + e := Commit(out_d, scalars_d, points_d, msmSize) + + if e != 0 { + panic("Error occured") + } + } + }) + } +} + +func TestBenchMSM(t *testing.T) { + for _, batchPow2 := range []int{2, 4} { + for _, pow2 := range []int{4, 6} { + msmSize := 1 << pow2 + batchSize := 1 << batchPow2 + count := msmSize * batchSize + + points, _ := GeneratePoints(count) + scalars, _ := GenerateScalars(count) + + a, e := MsmBatchBN254(&points, &scalars, batchSize, 0) + + if e != nil { + t.Errorf("MsmBatchBN254 returned an error: %v", e) + } + + if len(a) != batchSize { + t.Errorf("Expected length %d, but got %d", batchSize, len(a)) + } + } + } +} + +func BenchmarkMSM(b *testing.B) { + LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26} + + for _, logMsmSize := range LOG_MSM_SIZES { + msmSize := 1 << logMsmSize + points, _ := GeneratePoints(msmSize) + scalars, _ := GenerateScalars(msmSize) + b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) { + for n := 0; n < b.N; n++ { + out := new(PointBN254) + _, e := MsmBN254(out, points, scalars, 0) + + if e != nil { + panic("Error occured") + } + } + }) + } +} + +// G2 + +func randG2Jac() (bn254.G2Jac, error) { + var point bn254.G2Jac + var scalar fr.Element + + _, err := scalar.SetRandom() + if err != nil { + return point, err + } + + _, genG2Jac, _, _ := bn254.Generators() + + randomBigInt := big.NewInt(1000) + + point.ScalarMultiplication(&genG2Jac, scalar.BigInt(randomBigInt)) + return point, nil +} + +func GenerateG2Points(count int) ([]G2PointAffine, []bn254.G2Affine) { + // Declare a slice of integers + var points []G2PointAffine + var pointsAffine []bn254.G2Affine + + // populate the slice + for i := 0; i < count; i++ { + gnarkP, _ := randG2Jac() + + var p G2PointAffine + p.FromGnarkJac(&gnarkP) + + var gp bn254.G2Affine + gp.FromJacobian(&gnarkP) + pointsAffine = append(pointsAffine, gp) + points = append(points, p) + } + + return points, pointsAffine +} + +func TestMsmG2BN254(t *testing.T) { + for _, v := range []int{24} { + count := 1 << v + points, gnarkPoints := GenerateG2Points(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + fmt.Print("Finished generating scalars\n") + + out := new(G2Point) + _, e := MsmG2BN254(out, points, scalars, 0) + assert.Equal(t, e, nil, "error should be nil") + + var result G2PointAffine + var bn254AffineLib bn254.G2Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + + result.FromGnarkAffine(gResult) + + pp := result.ToProjective() + assert.True(t, out.eqg2(&pp)) + //assert.Equal(t, out, result.ToProjective()) + } +} + +func BenchmarkMsmG2BN254(b *testing.B) { + LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26} + + for _, logMsmSize := range LOG_MSM_SIZES { + msmSize := 1 << logMsmSize + points, _ := GenerateG2Points(msmSize) + scalars, _ := GenerateScalars(msmSize) + b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) { + for n := 0; n < b.N; n++ { + out := new(G2Point) + _, e := MsmG2BN254(out, points, scalars, 0) + + if e != nil { + panic("Error occured") + } + } + }) + } +} + +func TestCommitG2MSM(t *testing.T) { + for _, v := range []int{24} { + count := 1 << v + + points, gnarkPoints := GenerateG2Points(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + 
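+		// G2 points are wider than G1 points, so the device buffer sizes below are
+		// derived from unsafe.Sizeof rather than the hard-coded 64/96 bytes used in
+		// the G1 commit test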
fmt.Print("Finished generating scalars\n") + + var sizeCheckG2PointAffine G2PointAffine + inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine)) + + var sizeCheckG2Point G2Point + out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point))) + + points_d, _ := goicicle.CudaMalloc(inputPointsBytes) + goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes) + + scalarBytes := count * 32 + scalars_d, _ := goicicle.CudaMalloc(scalarBytes) + goicicle.CudaMemCpyHtoD[ScalarField](scalars_d, scalars, scalarBytes) + + startTime := time.Now() + e := CommitG2(out_d, scalars_d, points_d, count) + fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds()) + + outHost := make([]G2Point, 1) + goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point))) + + assert.Equal(t, e, 0, "error should be 0") + fmt.Print("Finished icicle MSM\n") + + var bn254AffineLib bn254.G2Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + fmt.Print("Finished Gnark MSM\n") + var resultGnark G2PointAffine + resultGnark.FromGnarkAffine(gResult) + + resultGnarkProjective := resultGnark.ToProjective() + assert.Equal(t, len(outHost), 1) + result := outHost[0] + + assert.True(t, result.eqg2(&resultGnarkProjective)) + } +} + +func TestBatchG2MSM(t *testing.T) { + for _, batchPow2 := range []int{2, 4} { + for _, pow2 := range []int{4, 6} { + msmSize := 1 << pow2 + batchSize := 1 << batchPow2 + count := msmSize * batchSize + + points, _ := GenerateG2Points(count) + scalars, _ := GenerateScalars(count) + + a, e := MsmG2BatchBN254(&points, &scalars, batchSize, 0) + + if e != nil { + t.Errorf("MsmBatchBN254 returned an error: %v", e) + } + + if len(a) != batchSize { + t.Errorf("Expected length %d, but got %d", batchSize, len(a)) + } + } + } +} diff --git a/goicicle/curves/bn254/ntt.go b/goicicle/curves/bn254/ntt.go new file mode 100644 index 000000000..313481692 --- /dev/null +++ b/goicicle/curves/bn254/ntt.go @@ -0,0 +1,202 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +// #cgo CFLAGS: -I../../../icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "ntt.h" +import "C" +import ( + "errors" + "fmt" + "unsafe" + + "github.com/ingonyama-zk/icicle/goicicle" +) + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func NttBN254(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatchBN254(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBN254(values *[]PointBN254, isInverse bool, deviceId int) uint64 { + valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_bn254(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatchBN254(values *[]PointBN254, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_bn254(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) { + domain_size := C.uint32_t(d_size) + logn := C.uint32_t(log_d_size) + is_inverse := C.bool(inverse) + + dp := C.build_domain_cuda_bn254(domain_size, logn, is_inverse, 0, 0) + + if dp == nil { + err = errors.New("nullptr returned from generating twiddles") + return unsafe.Pointer(nil), err + } + + return unsafe.Pointer(dp), nil +} + +// Reverses d_scalars in-place +func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) { + scalarsC := (*C.BN254_scalar_t)(d_scalars) + lenC := C.int(len) + if success := C.reverse_order_scalars_cuda_bn254(scalarsC, lenC, 0, 0); success != 0 { + return -1, errors.New("reversing failed") + } + return 0, nil +} + +func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer { + size_d := size * 32 + dp, err := goicicle.CudaMalloc(size_d) + + if err != nil { + return nil + } + + d_out := (*C.BN254_scalar_t)(dp) + scalarsC := (*C.BN254_scalar_t)(scalars) + twiddlesC := (*C.BN254_scalar_t)(twiddles) + cosetPowersC := (*C.BN254_scalar_t)(cosetPowers) + sizeC := C.uint(size) + + var ret C.int + if isCoset { + ret = C.interpolate_scalars_on_coset_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0) + } else { + ret = C.interpolate_scalars_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, 0, 0) + } + if ret != 0 { + fmt.Print("error interpolating") + } + + return unsafe.Pointer(d_out) +} + +func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int { + scalars_outC := (*C.BN254_scalar_t)(scalars_out) + scalarsC := (*C.BN254_scalar_t)(scalars) + twiddlesC := (*C.BN254_scalar_t)(twiddles) 
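+	// the unsafe.Pointer arguments are presumed to be device pointers
+	// (e.g. allocated with goicicle.CudaMalloc), matching how Interpolate
+	// allocates its output above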
+ coset_powersC := (*C.BN254_scalar_t)(coset_powers) + sizeC := C.uint(scalars_size) + twiddlesC_size := C.uint(twiddles_size) + + var ret C.int + if isCoset { + ret = C.evaluate_scalars_on_coset_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0) + } else { + ret = C.evaluate_scalars_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0) + } + + if ret != 0 { + fmt.Print("error interpolating") + return -1 + } + + return 0 +} + +func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int { + in1_dC := (*C.BN254_scalar_t)(in1_d) + in2_dC := (*C.BN254_scalar_t)(in2_d) + sizeC := C.uint(size) + + ret := C.add_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0) + + if ret != 0 { + fmt.Print("error adding scalar vectors") + return -1 + } + + return 0 +} + +func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int { + in1_dC := (*C.BN254_scalar_t)(in1_d) + in2_dC := (*C.BN254_scalar_t)(in2_d) + sizeC := C.uint(size) + + ret := C.sub_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0) + + if ret != 0 { + fmt.Print("error subtracting scalar vectors") + return -1 + } + + return 0 +} + +func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) { + scalarsC := (*C.BN254_scalar_t)(d_scalars) + lenC := C.uint(len) + if success := C.to_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 { + return -1, errors.New("reversing failed") + } + return 0, nil +} + +func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) { + scalarsC := (*C.BN254_scalar_t)(d_scalars) + lenC := C.uint(len) + if success := C.from_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 { + return -1, errors.New("reversing failed") + } + return 0, nil +} + + diff --git a/goicicle/curves/bn254/ntt_test.go b/goicicle/curves/bn254/ntt_test.go new file mode 100644 index 000000000..9c53afb3d --- /dev/null +++ b/goicicle/curves/bn254/ntt_test.go @@ -0,0 +1,219 @@ +package bn254 + +import ( + "fmt" + "reflect" + "testing" + + "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" + "github.com/stretchr/testify/assert" +) + +func TestNttBN254BBB(t *testing.T) { + count := 1 << 20 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBatchBN254(&nttResult, false, count, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + // DIT WITH NO INVERSE + // DIF WITH INVERSE + domain.FFT(frScalars, fft.DIT) //DIF + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestNttBN254CompareToGnarkDIF(t *testing.T) { + count := 1 << 2 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, false, DIF, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + // DIT WITH NO INVERSE + // DIF WITH INVERSE + domain.FFT(frScalars, fft.DIF) //DIF + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + 
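+		// map each icicle scalar back to a gnark fr.Element so the icicle NTT
+		// output can be compared directly against gnark's FFT result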
nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestNttBN254CompareToGnarkDIT(t *testing.T) { + count := 1 << 2 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, false, DIT, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + // DIT WITH NO INVERSE + // DIF WITH INVERSE + domain.FFT(frScalars, fft.DIT) //DIF + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestINttBN254CompareToGnarkDIT(t *testing.T) { + count := 1 << 3 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, true, DIT, 0) + assert.NotEqual(t, nttResult, scalars) + + frResScalars := make([]fr.Element, len(frScalars)) // Make a new slice with the same length + copy(frResScalars, frScalars) + + domain := fft.NewDomain(uint64(len(scalars))) + domain.FFTInverse(frResScalars, fft.DIT) + + assert.NotEqual(t, frResScalars, frScalars) + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frResScalars) +} + +func TestINttBN254CompareToGnarkDIF(t *testing.T) { + count := 1 << 3 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, true, DIF, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + domain.FFTInverse(frScalars, fft.DIF) + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestNttBN254(t *testing.T) { + count := 1 << 3 + + scalars, _ := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, false, NONE, 0) + assert.NotEqual(t, nttResult, scalars) + + inttResult := make([]ScalarField, len(nttResult)) + copy(inttResult, nttResult) + + assert.Equal(t, inttResult, nttResult) + NttBN254(&inttResult, true, NONE, 0) + assert.Equal(t, inttResult, scalars) +} + +func TestNttBatchBN254(t *testing.T) { + count := 1 << 5 + batches := 4 + + scalars, _ := GenerateScalars(count * batches) + + var scalarVecOfVec [][]ScalarField = make([][]ScalarField, 0) + + for i := 0; i < batches; i++ { + start := i * count + end := (i + 1) * count + batch := make([]ScalarField, len(scalars[start:end])) + copy(batch, scalars[start:end]) + scalarVecOfVec = append(scalarVecOfVec, batch) + } + + nttBatchResult := make([]ScalarField, len(scalars)) + copy(nttBatchResult, scalars) + + NttBatchBN254(&nttBatchResult, false, 
count, 0) + + var nttResultVecOfVec [][]ScalarField + + for i := 0; i < batches; i++ { + // Clone the slice + clone := make([]ScalarField, len(scalarVecOfVec[i])) + copy(clone, scalarVecOfVec[i]) + + // Add it to the result vector of vectors + nttResultVecOfVec = append(nttResultVecOfVec, clone) + + // Call the ntt_bn254 function + NttBN254(&nttResultVecOfVec[i], false, NONE, 0) + } + + assert.NotEqual(t, nttBatchResult, scalars) + + // Check that the ntt of each vec of scalars is equal to the intt of the specific batch + for i := 0; i < batches; i++ { + if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) { + t.Errorf("ntt of vec of scalars not equal to intt of specific batch") + } + } +} + +func BenchmarkNTT(b *testing.B) { + LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26} + + for _, logNTTSize := range LOG_NTT_SIZES { + nttSize := 1 << logNTTSize + b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) { + scalars, _ := GenerateScalars(nttSize) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + for n := 0; n < b.N; n++ { + NttBN254(&nttResult, false, NONE, 0) + } + }) + } +} diff --git a/goicicle/curves/bn254/utils.go b/goicicle/curves/bn254/utils.go new file mode 100644 index 000000000..5186e79bf --- /dev/null +++ b/goicicle/curves/bn254/utils.go @@ -0,0 +1,34 @@ +package bn254 + +import ( + "encoding/binary" + "log" + "time" +) + +// Function to convert [8]uint32 to [4]uint64 +func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 { + var arr64 [4]uint64 + for i := 0; i < len(arr32); i += 2 { + arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1]) + } + return arr64 +} + +func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 { + var arr32 [8]uint32 + for i, v := range arr64 { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, v) + + arr32[i*2] = binary.LittleEndian.Uint32(b[0:4]) + arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8]) + } + + return arr32 +} + +func TimeTrack(start time.Time, name string) { + elapsed := time.Since(start) + log.Printf("%s took %s", name, elapsed) +} diff --git a/goicicle/curves/bn254/utils_test.go b/goicicle/curves/bn254/utils_test.go new file mode 100644 index 000000000..d9f555260 --- /dev/null +++ b/goicicle/curves/bn254/utils_test.go @@ -0,0 +1,81 @@ +package bn254 + +import ( + "testing" +) + +func TestConvertUint32ArrToUint64Arr(t *testing.T) { + testCases := []struct { + name string + input [8]uint32 + want [4]uint64 + }{ + { + name: "Test with incremental array", + input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}, + want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080}, + }, + { + name: "Test with all zeros", + input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}, + want: [4]uint64{0, 0, 0, 0}, + }, + { + name: "Test with maximum uint32 values", + input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, + want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615}, + }, + { + name: "Test with alternating min and max uint32 values", + input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295}, + want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295}, + }, + { + name: "Test with alternating max and min uint32 values", + input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0}, + want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 
18446744069414584320}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint32ArrToUint64Arr(tc.input) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} + +func TestConvertUint64ArrToUint32Arr(t *testing.T) { + testCases := []struct { + name string + input [4]uint64 + expected [8]uint32 + }{ + { + name: "test one", + input: [4]uint64{1, 2, 3, 4}, + expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0}, + }, + { + name: "test two", + input: [4]uint64{100, 200, 300, 400}, + expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0}, + }, + { + name: "test three", + input: [4]uint64{1000, 2000, 3000, 4000}, + expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint64ArrToUint32Arr(tc.input) + if got != tc.expected { + t.Errorf("got %v, want %v", got, tc.expected) + } + }) + } +} diff --git a/goicicle/curves/bn254/vec_mod.go b/goicicle/curves/bn254/vec_mod.go new file mode 100644 index 000000000..348e445e6 --- /dev/null +++ b/goicicle/curves/bn254/vec_mod.go @@ -0,0 +1,41 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +// #cgo CFLAGS: -I../../../icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "ve_mod_mult.h" +import "C" +import ( + "fmt" + "unsafe" +) + +func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int { + scalarVec1C := (*C.BN254_scalar_t)(scalarVec1) + scalarVec2C := (*C.BN254_scalar_t)(scalarVec2) + sizeC := C.size_t(size) + + ret := C.vec_mod_mult_device_scalar_bn254(scalarVec1C, scalarVec2C, sizeC, 0) + + if ret != 0 { + fmt.Print("error multiplying scalar vectors") + return -1 + } + + return 0 +} diff --git a/goicicle/go.mod b/goicicle/go.mod new file mode 100644 index 000000000..13f279ad4 --- /dev/null +++ b/goicicle/go.mod @@ -0,0 +1,20 @@ +module github.com/ingonyama-zk/icicle/goicicle + +go 1.20 + +require github.com/consensys/gnark-crypto v0.11.0 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) + +require ( + github.com/bits-and-blooms/bitset v1.5.0 // indirect + github.com/consensys/bavard v0.1.13 + github.com/mmcloughlin/addchain v0.4.0 // indirect + github.com/stretchr/testify v1.8.3 + golang.org/x/sys v0.2.0 // indirect + rsc.io/tmplfunc v0.0.3 // indirect +) diff --git a/goicicle/go.sum b/goicicle/go.sum new file mode 100644 index 000000000..91618224f --- /dev/null +++ b/goicicle/go.sum @@ -0,0 +1,25 @@ +github.com/bits-and-blooms/bitset v1.5.0 h1:NpE8frKRLGHIcEzkR+gZhiioW1+WbYV6fKwD6ZIpQT8= +github.com/bits-and-blooms/bitset v1.5.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= +github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ= +github.com/consensys/bavard v0.1.13/go.mod 
h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= +github.com/consensys/gnark-crypto v0.11.0 h1:QqzHQlwEqlQr5jfWblGDkwlKHpT+4QodYqqExkAtyks= +github.com/consensys/gnark-crypto v0.11.0/go.mod h1:Iq/P3HHl0ElSjsg2E1gsMwhAyxnxoKK5nVyZKd+/KhU= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk= +github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c= +github.com/mmcloughlin/addchain v0.4.0 h1:SobOdjm2xLj1KkXN5/n0xTIWyZA2+s99UCY1iPfkHRY= +github.com/mmcloughlin/addchain v0.4.0/go.mod h1:A86O+tHqZLMNO4w6ZZ4FlVQEadcoqkyU72HC5wJ4RlU= +github.com/mmcloughlin/profile v0.1.1/go.mod h1:IhHD7q1ooxgwTgjxQYkACGA77oFTDdFVejUS1/tS/qU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= +github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +golang.org/x/sys v0.2.0 h1:ljd4t30dBnAvMZaQCevtY0xLLD0A+bRZXbgLMLU1F/A= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU= +rsc.io/tmplfunc v0.0.3/go.mod h1:AG3sTPzElb1Io3Yg4voV9AGZJuleGAwaVRxL9M49PhA= diff --git a/goicicle/goicicle.go b/goicicle/goicicle.go new file mode 100644 index 000000000..834948304 --- /dev/null +++ b/goicicle/goicicle.go @@ -0,0 +1,58 @@ +package goicicle + +// This file implements CUDA driver context management + +// #cgo CFLAGS: -I /usr/loca/cuda/include +// #cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart +/* +#include +#include +*/ +import "C" + +import ( + "errors" + "unsafe" +) + +// Version returns the version of the CUDA driver +// func Version() int { +// var v C.int +// if err := C.cuDriverGetVersion(&v); err != 0 { +// return -1 +// } +// return int(v) +// } + +func CudaMalloc(size int) (dp unsafe.Pointer, err error) { + var p C.void + dp = unsafe.Pointer(&p) + if err := C.cudaMalloc(&dp, C.size_t(size)); err != 0 { + return nil, errors.New("could not create memory space") + } + return dp, nil +} + +func CudaFree(dp unsafe.Pointer) int { + if err := C.cudaFree(dp); err != 0 { + return -1 + } + return 0 +} + +func CudaMemCpyHtoD[T any](dst_d unsafe.Pointer, src []T, size int) int { + src_c := unsafe.Pointer(&src[0]) + if err := C.cudaMemcpy(dst_d, src_c, C.size_t(size), 1); err != 0 { + return -1 + } + return 0 +} + +func CudaMemCpyDtoH[T any](dst []T, src_d unsafe.Pointer, size int) int { + dst_c := unsafe.Pointer(&dst[0]) + + if err := C.cudaMemcpy(dst_c, src_d, C.size_t(size), 2); err != 0 { + return -1 + } + return 0 +} diff --git a/goicicle/templates/curves/curves.go b/goicicle/templates/curves/curves.go new file mode 100644 index 000000000..e56dbbace --- /dev/null +++ b/goicicle/templates/curves/curves.go @@ -0,0 +1,37 @@ +package config + +type Curve struct { + CurveName string + PackageName string 
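+	// SharedLib is the linker flag for the curve's prebuilt shared library (e.g. "-lbn254"),
+	// Prefix is the suffix used in the generated C symbol names (msm_cuda_<Prefix>, ntt_cuda_<Prefix>, ...),
+	// and ScalarSize/BaseSize give the number of uint32 limbs in a scalar / base-field element.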
+ SharedLib string + Prefix string + ScalarSize int + BaseSize int +} + +var BN_254 = Curve{ + CurveName: "BN254", + PackageName: "bn254", + SharedLib: "-lbn254", + Prefix: "bn254", + ScalarSize: 8, + BaseSize: 8, +} + +var BLS_12_377 = Curve{ + CurveName: "BLS12377", + PackageName: "bls12377", + SharedLib: "-lbn12_377", + Prefix: "bls12_377", + ScalarSize: 8, + BaseSize: 12, +} + +var BLS_12_381 = Curve{ + CurveName: "BLS12381", + PackageName: "bls12381", + SharedLib: "-lbn12_381", + Prefix: "bls12_381", + ScalarSize: 8, + BaseSize: 12, +} diff --git a/goicicle/templates/curves/g1.go.tmpl b/goicicle/templates/curves/g1.go.tmpl new file mode 100644 index 000000000..db7e7e8e4 --- /dev/null +++ b/goicicle/templates/curves/g1.go.tmpl @@ -0,0 +1,469 @@ +import ( + "unsafe" + + "encoding/binary" + "fmt" + + {{ template "import_ecc" . }} + {{ template "import_fp" . }} + {{ template "import_fr" . }} +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/{{.PackageName}}/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}} +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = {{.ScalarSize}} +const BASE_SIZE = {{.BaseSize}} + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert 
point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * Point{{.CurveName}} + */ + +type Point{{.CurveName}} struct { + x, y, z BaseField +} + +func NewPoint{{.CurveName}}Zero() *Point{{.CurveName}} { + return &Point{{.CurveName}}{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: *NewFieldZero[BaseField](), + } +} + +func (p *Point{{.CurveName}}) eq(pCompare *Point{{.CurveName}}) bool { + // Cast *Point{{.CurveName}} to *C.{{.CurveName}}_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_{{.PackageName}}(pC, pCompareC)) +} + +func (p *Point{{.CurveName}}) strip_z() *PointAffineNoInfinity{{.CurveName}} { + return &PointAffineNoInfinity{{.CurveName}}{ + x: p.x, + y: p.y, + } +} + +func (p *Point{{.CurveName}}) toGnarkAffine() *{{.PackageName}}.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &{{.PackageName}}.G1Affine{X: *x, Y: *y} +} + +func (p *Point{{.CurveName}}) ToGnarkJac() *{{.PackageName}}.G1Jac { + var p1 {{.PackageName}}.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func Point{{.CurveName}}FromG1AffineGnark(gnark *{{.PackageName}}.G1Affine) *Point{{.CurveName}} { + point := Point{{.CurveName}}{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func Point{{.CurveName}}FromJacGnark(gnark *{{.PackageName}}.G1Jac) *Point{{.CurveName}} { + var pointAffine {{.PackageName}}.G1Affine + pointAffine.FromJacobian(gnark) + + point := Point{{.CurveName}}{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func Point{{.CurveName}}fromLimbs(x, y, z *[]uint32) *Point{{.CurveName}} { + return &Point{{.CurveName}}{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinity{{.CurveName}} + */ + +type PointAffineNoInfinity{{.CurveName}} struct { + x, y BaseField +} + +func NewPointAffineNoInfinity{{.CurveName}}Zero() *PointAffineNoInfinity{{.CurveName}} { + return &PointAffineNoInfinity{{.CurveName}}{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinity{{.CurveName}}) toProjective() *Point{{.CurveName}} { + return &Point{{.CurveName}}{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinity{{.CurveName}}) toGnarkAffine() *{{.PackageName}}.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func 
PointAffineNoInfinity{{.CurveName}}FromLimbs(x, y *[]uint32) *PointAffineNoInfinity{{.CurveName}} { + return &PointAffineNoInfinity{{.CurveName}}{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []Point{{.CurveName}}, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_{{.PackageName}}(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_scalar_{{.PackageName}}(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_{{.PackageName}}(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 { + channels := make([]chan []T, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) 
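+			// Channels are drained in index order, so the converted elements come back
+			// in their original order. Note that batchLen*routines can be smaller than
+			// len(elements); any trailing remainder is not converted by this split.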
+ } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []{{.PackageName}}.G1Affine) []PointAffineNoInfinity{{.CurveName}} { + var newElements []PointAffineNoInfinity{{.CurveName}} + for _, e := range elements { + newElement := Point{{.CurveName}}FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/templates/curves/g2.go.tmpl b/goicicle/templates/curves/g2.go.tmpl new file mode 100644 index 000000000..89f736b6c --- /dev/null +++ b/goicicle/templates/curves/g2.go.tmpl @@ -0,0 +1,83 @@ +import ( + "unsafe" + {{ template "import_ecc" . }} +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/{{toLower .CurveName}}/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}} +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []{{.PackageName}}.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *Point{{.CurveName}} to *C.{{.CurveName}}_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.{{.CurveName}}_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.{{.CurveName}}_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. 
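+	// Layout assumption: G2Point (three ExtentionField coordinates, each holding two
+	// 4x64-bit limb arrays) is expected to match the memory layout of
+	// C.{{.CurveName}}_g2_projective_t; the casts above are only safe if the two layouts agree.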
+ return bool(C.eq_g2_{{.PackageName}}(pC, pCompareC)) +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *{{.PackageName}}.G2Affine) *G2PointAffine { + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *{{.PackageName}}.G2Jac) *G2PointAffine { + var pointAffine {{.PackageName}}.G2Affine + pointAffine.FromJacobian(gnark) + + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/templates/curves/imports.go.tmpl b/goicicle/templates/curves/imports.go.tmpl new file mode 100644 index 000000000..d04d0acea --- /dev/null +++ b/goicicle/templates/curves/imports.go.tmpl @@ -0,0 +1,34 @@ +{{ define "import_fr" }} + +{{ if eq .CurveName "BN254"}} + "github.com/consensys/gnark-crypto/ecc/bn254/fr" +{{ else if eq .CurveName "BLS12377"}} + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" +{{ else if eq .CurveName "BLS12381"}} + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" +{{end}} + +{{end}} + +{{ define "import_fp" }} +{{ if eq .CurveName "BN254"}} + "github.com/consensys/gnark-crypto/ecc/bn254/fp" +{{ else if eq .CurveName "BLS12377"}} + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" +{{ else if eq .CurveName "BLS12381"}} + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" +{{end}} + +{{end}} + +{{ define "import_ecc" }} + +{{ if eq .CurveName "BN254"}} + "github.com/consensys/gnark-crypto/ecc/bn254" +{{ else if eq .CurveName "BLS12377"}} + "github.com/consensys/gnark-crypto/ecc/bls12-377" +{{ else if eq .CurveName "BLS12381"}} + "github.com/consensys/gnark-crypto/ecc/bls12-381" +{{end}} + +{{end}} diff --git a/goicicle/templates/hfiles/c_api.h.tmpl b/goicicle/templates/hfiles/c_api.h.tmpl new file mode 100644 index 000000000..e0a5ea7d2 --- /dev/null +++ b/goicicle/templates/hfiles/c_api.h.tmpl @@ -0,0 +1,15 @@ +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; + +bool eq_{{.Prefix}}({{.CurveName}}_projective_t *point1, {{.CurveName}}_projective_t *point2, size_t device_id); + +#ifdef __cplusplus +} +#endif diff --git a/goicicle/templates/hfiles/msm.h.tmpl b/goicicle/templates/hfiles/msm.h.tmpl new file mode 100644 index 000000000..50d01abab --- /dev/null +++ b/goicicle/templates/hfiles/msm.h.tmpl @@ -0,0 +1,35 @@ +#include +#include +// msm.h + +#ifndef _{{.CurveName}}_MSM_H +#define _{{.CurveName}}_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of {{.CurveName}} projective and affine structs +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; +typedef struct {{.CurveName}}_affine_t {{.CurveName}}_affine_t; +typedef struct {{.CurveName}}_scalar_t {{.CurveName}}_scalar_t; + +int msm_cuda_{{.Prefix}}({{.CurveName}}_projective_t* out, {{.CurveName}}_affine_t* points, + {{.CurveName}}_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_{{.Prefix}}({{.CurveName}}_projective_t* out, {{.CurveName}}_affine_t* points, + {{.CurveName}}_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_{{.Prefix}}({{.CurveName}}_projective_t* d_out, 
{{.CurveName}}_scalar_t* d_scalars, + {{.CurveName}}_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_{{.Prefix}}({{.CurveName}}_projective_t* d_out, {{.CurveName}}_scalar_t* d_scalars, + {{.CurveName}}_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _{{.CurveName}}_MSM_H */ diff --git a/goicicle/templates/hfiles/ntt.h.tmpl b/goicicle/templates/hfiles/ntt.h.tmpl new file mode 100644 index 000000000..004d8cdf4 --- /dev/null +++ b/goicicle/templates/hfiles/ntt.h.tmpl @@ -0,0 +1,27 @@ +#include +#include +// ntt.h + +#ifndef _{{.CurveName}}_NTT_H +#define _{{.CurveName}}_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of {{.CurveName}} projective and affine structs +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; +typedef struct {{.CurveName}}_affine_t {{.CurveName}}_affine_t; +typedef struct {{.CurveName}}_scalar_t {{.CurveName}}_scalar_t; + +int ntt_cuda_{{.Prefix}}({{.CurveName}}_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_{{.Prefix}}({{.CurveName}}_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_{{.Prefix}}({{.CurveName}}_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_{{.Prefix}}({{.CurveName}}_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _{{.CurveName}}_NTT_H */ diff --git a/goicicle/templates/hfiles/ve_mod_mult.h.tmpl b/goicicle/templates/hfiles/ve_mod_mult.h.tmpl new file mode 100644 index 000000000..01d1241f2 --- /dev/null +++ b/goicicle/templates/hfiles/ve_mod_mult.h.tmpl @@ -0,0 +1,24 @@ +#include +#include +// ve_mod_mult.h + +#ifndef _{{.CurveName}}_VEC_MULT_H +#define _{{.CurveName}}_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; +typedef struct {{.CurveName}}_scalar_t {{.CurveName}}_scalar_t; + +int32_t vec_mod_mult_point_{{.Prefix}}({{.CurveName}}_projective_t *inout, {{.CurveName}}_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_{{.Prefix}}({{.CurveName}}_scalar_t *inout, {{.CurveName}}_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t matrix_vec_mod_mult_{{.Prefix}}({{.CurveName}}_scalar_t *matrix_flattened, {{.CurveName}}_scalar_t *input, {{.CurveName}}_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _{{.CurveName}}_VEC_MULT_H */ diff --git a/goicicle/templates/main.go b/goicicle/templates/main.go new file mode 100644 index 000000000..712f3e0a4 --- /dev/null +++ b/goicicle/templates/main.go @@ -0,0 +1,161 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/consensys/bavard" + config "github.com/ingonyama-zk/icicle/goicicle/templates/curves" +) + +const ( + copyrightHolder = "Ingonyama" + generatedBy = "Ingonyama" + copyrightYear = 2023 + baseDir = "../curves/" + hBaseDir = "../../icicle/curves/" +) + +var bgen = bavard.NewBatchGenerator(copyrightHolder, copyrightYear, generatedBy) + +func main() { + bn254_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bn254", "g1.go"), Templates: []string{"g1.go.tmpl", "imports.go.tmpl"}}, + } + + bls12377_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12377", "g1.go"), Templates: 
[]string{"g1.go.tmpl", "imports.go.tmpl"}}, + } + + bls12381_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12381", "g1.go"), Templates: []string{"g1.go.tmpl", "imports.go.tmpl"}}, + } + + assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries...)) + assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries...)) + assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries...)) + + bn254_g2_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bn254", "g2.go"), Templates: []string{"g2.go.tmpl", "imports.go.tmpl"}}, + } + + bls12377_g2_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12377", "g2.go"), Templates: []string{"g2.go.tmpl", "imports.go.tmpl"}}, + } + + bls12381_g2_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12381", "g2.go"), Templates: []string{"g2.go.tmpl", "imports.go.tmpl"}}, + } + + assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_g2_entries...)) + assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_g2_entries...)) + assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_g2_entries...)) + + //bn254_msm_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bn254", "msm.go"), Templates: []string{"msm.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12377_msm_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12377", "msm.go"), Templates: []string{"msm.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12381_msm_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12381", "msm.go"), Templates: []string{"msm.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./msm/", bls12377_msm_entries...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./msm/", bn254_msm_entries...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./msm/", bls12381_msm_entries...)) + // + //bn254_ntt_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bn254", "ntt.go"), Templates: []string{"ntt.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12377_ntt_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12377", "ntt.go"), Templates: []string{"ntt.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12381_ntt_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12381", "ntt.go"), Templates: []string{"ntt.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ntt/", bls12377_ntt_entries...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ntt/", bn254_ntt_entries...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ntt/", bls12381_ntt_entries...)) + + /* + h_files + */ + + //h_msm_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "msm.h"), Templates: []string{"msm.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_msm_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "msm.h"), Templates: []string{"msm.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_msm_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", 
"msm.h"), Templates: []string{"msm.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_msm_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_msm_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_msm_bls12_381...)) + // + //h_ntt_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "ntt.h"), Templates: []string{"ntt.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_ntt_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "ntt.h"), Templates: []string{"ntt.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_ntt_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", "ntt.h"), Templates: []string{"ntt.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_ntt_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_ntt_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_ntt_bls12_381...)) + // + //ve_mod_mult_h_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //ve_mod_mult_h_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //ve_mod_mult_ht_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", ve_mod_mult_h_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", ve_mod_mult_h_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", ve_mod_mult_ht_bls12_381...)) + // + //c_api_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "c_api.h"), Templates: []string{"c_api.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //c_api_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "c_api.h"), Templates: []string{"c_api.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //c_api_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", "c_api.h"), Templates: []string{"c_api.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", c_api_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", c_api_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", c_api_bls12_381...)) +} + +func assertNoError(err error) { + if err != nil { + fmt.Printf("\n%s\n", err.Error()) + os.Exit(-1) + } +} diff --git a/goicicle/templates/msm/msm.go.tmpl b/goicicle/templates/msm/msm.go.tmpl new file mode 100644 index 000000000..1840e789f --- /dev/null +++ b/goicicle/templates/msm/msm.go.tmpl @@ -0,0 +1,71 @@ +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/{{toLower .CurveName}}/ +// #cgo LDFLAGS: 
-L${SRCDIR}/../../ {{.SharedLib}} +// #include "msm.h" +import "C" + +func Msm{{.CurveName}}(points []PointAffineNoInfinity{{.CurveName}}, scalars []ScalarField, device_id int) (*Point{{.CurveName}}, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + out := new(Point{{.CurveName}}) + + pointsC := (*C.{{.CurveName}}_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_cuda_{{.Prefix}}(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_{{.Prefix}} returned error code: %d", ret) + } + + return out, nil +} + +func MsmBatch{{.CurveName}}(points *[]PointAffineNoInfinity{{.CurveName}}, scalars *[]ScalarField, batchSize, deviceId int) ([]*Point{{.CurveName}}, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*Point{{.CurveName}}, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPoint{{.CurveName}}Zero() + } + + outC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.{{.CurveName}}_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_cuda_{{.Prefix}}(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_{{.Prefix}} returned error code: %d", ret) + } + + return out, nil +} diff --git a/goicicle/templates/ntt/ntt.go.tmpl b/goicicle/templates/ntt/ntt.go.tmpl new file mode 100644 index 000000000..8896725f9 --- /dev/null +++ b/goicicle/templates/ntt/ntt.go.tmpl @@ -0,0 +1,54 @@ + +// #cgo CFLAGS: -I../../../icicle/curves//{{toLower .CurveName}}/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}} +// #include "ntt.h" +import "C" +import "unsafe" + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func Ntt{{.CurveName}}(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_{{.Prefix}}(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatch{{.CurveName}}(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_{{.Prefix}}(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNtt{{.CurveName}}(values *[]Point{{.CurveName}}, isInverse bool, deviceId int) uint64 { + valuesC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := 
C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_{{.Prefix}}(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatch{{.CurveName}}(values *[]Point{{.CurveName}}, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_{{.Prefix}}(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 4836df87a..b4c3f345b 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -1,8 +1,8 @@ -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.18) # GoogleTest requires at least C++14 set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) # add the target cuda architectures @@ -22,6 +22,10 @@ FetchContent_Declare( URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip ) # For Windows: Prevent overriding the parent project's compiler/linker settings + +# boosting lib +include_directories("/home/miner/include/boost_1_80_0") + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 9bb6b4597..728f6a187 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -3,6 +3,7 @@ #pragma once #include #include +#include #include "../../primitives/affine.cuh" #include #include @@ -14,9 +15,385 @@ #include "../../primitives/field.cuh" #include "msm.cuh" +#define TEMP_NUM 10 +#define MAX_TH 256 -#define BIG_TRIANGLE +// #define SIGNED_DIG +// #define BIG_TRIANGLE +// #define ZPRIZE // #define SSM_SUM //WIP +// #define PHASE1_TEST + +#define SIZE 32 +#define SHMEM_SIZE 64 * 4 //why this size? + +// For last iteration (saves useless work) +// Use volatile to prevent caching in registers (compiler optimization) +// No __syncthreads() necessary! 
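The Go NTT wrapper generated from ntt.go.tmpl above is easiest to see in a short usage sketch. The snippet below is illustrative only: it assumes the BN254 instantiation of the templates (NttBN254, ScalarField, NONE and BatchConvertFromFrGnark, all of which appear elsewhere in this patch) and that libbn254.so has already been built and is on the loader path. It runs a forward and then an inverse NTT on device 0 and reports whether the input is restored.

package main

import (
	"fmt"
	"reflect"

	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
	bn254icicle "github.com/ingonyama-zk/icicle/goicicle/curves/bn254"
)

func main() {
	size := 1 << 12

	// Random gnark scalars on the host, converted to the icicle limb representation.
	frScalars := make([]fr.Element, size)
	for i := range frScalars {
		frScalars[i].SetRandom()
	}
	scalars := bn254icicle.BatchConvertFromFrGnark[bn254icicle.ScalarField](frScalars)

	// Work on a copy so the original slice is kept for comparison.
	data := make([]bn254icicle.ScalarField, len(scalars))
	copy(data, scalars)

	// Forward NTT followed by inverse NTT, natural order (NONE decimation), device 0.
	bn254icicle.NttBN254(&data, false, bn254icicle.NONE, 0)
	bn254icicle.NttBN254(&data, true, bn254icicle.NONE, 0)

	fmt.Println("inverse NTT restored the input:", reflect.DeepEqual(data, scalars))
}

The batched variant generated from the same template (NttBatch...) follows the same pattern with an additional batchSize argument.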
+template +__device__ void warpReduce(P* shmem_ptr, int t, int first, int last) { + for (int i=first; i>last; i>>=1){ + shmem_ptr[t] = shmem_ptr[t] + shmem_ptr[t + i]; + } +} + +template +__global__ void general_sum_reduction_kernel(P *v, P *v_r, unsigned nof_partial_sums, unsigned write_stride, unsigned write_phase) { + // Allocate shared memory + __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + // int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // Load elements AND do first add of reduction + // Vector now 2x as long as number of threads, so scale i + int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + + // Store first partial result instead of just the elements + partial_sum[threadIdx.x] = v[i] + v[i + blockDim.x]; + __syncthreads(); + + // Start at 1/2 block stride and divide by two each iteration + // Stop early (call device function instead) + for (int s = blockDim.x / 2; s > nof_partial_sums-1; s >>= 1) { + // Each thread does work unless it is further than the stride + if (threadIdx.x < s) { + partial_sum[threadIdx.x] = partial_sum[threadIdx.x] + partial_sum[threadIdx.x + s]; + } + __syncthreads(); + } + //todo - add device function + // if (threadIdx.x < 32) { + // warpReduce(partial_sum, threadIdx.x); + // } + + // Let the thread 0 for this block write it's result to main memory + // Result is inexed by this block + if (threadIdx.x < nof_partial_sums) { + unsigned write_ind = nof_partial_sums*blockIdx.x + threadIdx.x; + v_r[((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride] = partial_sum[threadIdx.x]; + } +} + +template +__global__ void single_stage_multi_reduction_kernel(P *v, P *v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding) { + // Allocate shared memory + // __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int tid_p = padding? (tid/(2*padding))*padding + tid%padding: tid; + int jump =block_size/2; + int block_id = tid_p/jump; + int block_tid = tid_p%jump; + + // if (block_tid < jump){ + unsigned read_ind = block_size*block_id + block_tid; //fix + // unsigned padded_read_ind = block_size*block_id + block_tid; //fix + // unsigned write_ind = jump*block_id + block_tid; + unsigned write_ind = tid; + if (padding) printf(" %u %u %u %u\n",tid,tid_p,read_ind,((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride); + v_r[write_stride? ((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride : write_ind] = padding? (tid%(2*padding) +__global__ void variable_block_multi_reduction_kernel(P *v, P *v_r, unsigned *block_sizes, unsigned *block_offsets, unsigned write_stride, unsigned write_phase) { + // Allocate shared memory + // __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int jump =block_sizes[tid]/2; //???? 
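+  // Note: block_sizes[] and block_offsets[] are indexed per thread: each thread looks up
+  // the size and starting offset of the logical block it belongs to, adds the pair of
+  // elements that sit half a block apart (read_ind and read_ind + jump), and writes a
+  // single partial sum per thread at write_ind = block_offset/2 + block_tid (further
+  // interleaved by write_stride/write_phase when write_stride is non-zero).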
+ int block_offset = block_offsets[tid]; //block + int block_tid = tid - block_offset/2; //fix + + // if (block_tid < jump){ + unsigned read_ind = block_offset + block_tid; //fix + // unsigned padded_read_ind = block_size*block_id + block_tid; //fix + // unsigned write_ind = jump*block_id + block_tid; + unsigned write_ind = block_offset/2 + block_tid; + // if (padding) printf(" %u %u %u %u\n",tid,tid_p,read_ind,((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride); + v_r[write_stride? ((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride : write_ind] = v[read_ind] + v[read_ind + jump]; + // } +} + +template +__global__ void pad_buckets_kernel(P *v, P *v_r, unsigned block_size) { + // Allocate shared memory + // __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int dont_write = (tid/block_size)%2; + + v_r[tid] = dont_write? P::zero() : v[(tid/(block_size*2))*block_size + tid%block_size]; +} + + +template //todo-add SM and device function +__global__ void reduce_triangles_kernel(P *source_buckets,P* temp_buckets, P *target_buckets, const unsigned source_c, const unsigned source_nof_bms) { + // Allocate shared memory + // __shared__ int partial_sum[SHMEM_SIZE]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned source_nof_buckets = source_nof_bms<>1;//4 + // const unsigned target_nof_buckets = target_nof_bms<>1;//2^7 + // unsigned nof_threads_per_bm = target_nof_bm_buckets>>1; + // if (tid >= source_nof_buckets>>1) return; //total threads + unsigned bm_index = tid/nof_threads_per_bm; //blockidx + unsigned bm_bucket_index = tid%nof_threads_per_bm; //threadidx + unsigned bucket_index = bm_index*source_nof_bm_buckets + bm_bucket_index; + + // if (tid ==0) printf("source_nof_buckets %u\n",source_nof_buckets); + // if (tid ==0) printf("source_nof_bm_buckets %u\n",source_nof_bm_buckets); + // if (tid ==0) printf("temp_nof_bm_buckets %u\n",temp_nof_bm_buckets); + // if (tid ==0) printf("target_nof_bms %u\n",target_nof_bms); + // if (tid ==0) printf("target_c %u\n",target_c); + // if (tid ==0) printf("target_nof_bm_buckets %u\n",target_nof_bm_buckets); + // if (tid ==0) printf("nof_threads_per_bm %u\n",nof_threads_per_bm); + // Load elements AND do first add of reduction + // Vector now 2x as long as number of threads, so scale i + // int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + // __syncthreads(); + // if (tid ==0){ + // // printf("t\n"); + // // for (int i=0;i>1;j++) + // // {printf("%u ",temp_buckets[i*(source_nof_bm_buckets>>1)+j].x.x);} + // // printf("\n"); + // // } + // for (int i=0;i 32; s >>= 1) { + for (int s = nof_threads_per_bm/2; s > target_nof_bm_buckets/2; s >>= 1) { + // Each thread does work unless it is further than the stride + // temp_nof_bm_buckets = temp_nof_bm_buckets>>1; + // nof_threads_per_bm = temp_nof_bm_buckets>>1; + // bm_index = tid/nof_threads_per_bm; + // bm_bucket_index = tid%nof_threads_per_bm; + // bucket_index = bm_index*source_nof_bm_buckets + bm_bucket_index; + // if (tid ==9) printf("inds %u %u %u\n",bm_index,bm_bucket_index,bucket_index); + // if (tid < source_nof_bms*s) { + if (threadIdx.x < s) { + temp_buckets[bucket_index] = temp_buckets[bucket_index] + temp_buckets[bucket_index + s]; + } + __syncthreads(); + // if (tid ==0){ + // for (int i=0;i +__global__ void reduce_rectangles_kernel(P *source_buckets,P* 
temp_buckets, P *target_buckets, const unsigned source_c, const unsigned source_nof_bms) { + // Allocate shared memory + // __shared__ int partial_sum[SHMEM_SIZE]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned source_nof_buckets = source_nof_bms<>1; + // const unsigned target_nof_buckets = target_nof_bms<>target_c; + unsigned temp_nof_segment_buckets = source_nof_segment_buckets; + unsigned target_nof_bm_buckets = 1<>1;//2^7 + unsigned nof_threads_per_segment = source_nof_segment_buckets>>1; //difference between kernels + // if (tid >= source_nof_buckets>>1) return; //total threads + unsigned bm_index = tid/nof_threads_per_bm; //blockidx + unsigned bm_bucket_index = tid%nof_threads_per_bm; //threadidx + unsigned segment_index = bm_bucket_index/nof_threads_per_segment; + unsigned segment_bucket_index = bm_bucket_index%nof_threads_per_segment; + unsigned bucket_index = bm_index*source_nof_bm_buckets + segment_index*source_nof_segment_buckets + segment_bucket_index; + + // Load elements AND do first add of reduction + // Vector now 2x as long as number of threads, so scale i + // int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + + // Store first partial result instead of just the elements + // if (tid ==0){ + // printf("rtar\n"); + // for (int i=0;i 32; s >>= 1) { + for (int s = nof_threads_per_segment/2; s > 0; s >>= 1) { + // Each thread does work unless it is further than the stride + // temp_nof_segment_buckets = temp_nof_segment_buckets>>1; + // nof_threads_per_segment = temp_nof_segment_buckets>>1; + // segment_index = tid/nof_threads_per_segment; + // segment_bucket_index = tid%nof_threads_per_segment; + // bucket_index = segment_index*source_nof_segment_buckets + segment_bucket_index; + // if (tid < source_nof_bms*target_nof_bm_buckets*s) { //nof segments per bm + if (segment_bucket_index < s) { //nof segments per bm + temp_buckets[bucket_index] = temp_buckets[bucket_index] + temp_buckets[bucket_index + s]; + } + __syncthreads(); + // if (tid ==0){ + // for (int i=0;i>= 1) + result++; + return result; +} + +unsigned log2_ceiling(const unsigned value) { return value <= 1 ? 
0 : log2_floor(value - 1) + 1; } + +unsigned get_optimal_log_data_split(const unsigned mpc, const unsigned source_window_bits, const unsigned target_window_bits, + const unsigned target_windows_count) { +#define MAX_THREADS 32 +#define MIN_BLOCKS 12 +const unsigned full_occupancy = mpc * MAX_THREADS * MIN_BLOCKS; +const unsigned target = full_occupancy << 6; +const unsigned unit_threads_count = target_windows_count << target_window_bits; +const unsigned split_target = log2_ceiling(target / unit_threads_count); +const unsigned split_limit = source_window_bits - target_window_bits - 1; +return std::min(split_target, split_limit); +} + +template +static constexpr __device__ __forceinline__ T ld_single(const T *ptr) { +return __ldg(ptr); +}; + +template +static constexpr __device__ __forceinline__ T ld(const T *address, const unsigned offset) { + static_assert(alignof(T) % alignof(U) == 0); + static_assert(sizeof(T) % sizeof(U) == 0); + constexpr size_t count = sizeof(T) / sizeof(U); + T result = {}; + auto pa = reinterpret_cast(address) + offset; + auto pr = reinterpret_cast(&result); +#pragma unroll + for (unsigned i = 0; i < count; i++) { + const auto pai = pa + i * STRIDE; + const auto pri = pr + i; + *pri = ld_single(pai); + } + return result; +} + +template > +static constexpr __device__ __forceinline__ T memory_load(const T *address, const unsigned offset = 0, [[maybe_unused]] uint4 _dummy = {}) { + return ld(address, offset); +}; + +template > +static constexpr __device__ __forceinline__ T memory_load(const T *address, const unsigned offset = 0, [[maybe_unused]] uint2 _dummy = {}) { + return ld(address, offset); +}; + +template > +static constexpr __device__ __forceinline__ T memory_load(const T *address, const unsigned offset = 0, [[maybe_unused]] unsigned _dummy = {}) { + return ld(address, offset); +}; //this kernel performs single scalar multiplication //each thread multilies a single scalar and point @@ -62,42 +439,157 @@ __global__ void initialize_buckets_kernel(P *buckets, unsigned N) { //this kernel splits the scalars into digits of size c //each thread splits a single scalar into nof_bms digits template -__global__ void split_scalars_kernel(unsigned *buckets_indices, unsigned *point_indices, S *scalars, unsigned total_size, unsigned msm_log_size, unsigned nof_bms, unsigned bm_bitsize, unsigned c){ +__global__ void split_scalars_kernel(unsigned *buckets_indices, unsigned *point_indices, S *scalars, unsigned total_size, unsigned msm_log_size, unsigned nof_bms, unsigned bm_bitsize, unsigned c, unsigned top_bm_nof_missing_bits){ + constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned trash_bucket = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; unsigned bucket_index; + unsigned bucket_index2; unsigned current_index; unsigned msm_index = tid >> msm_log_size; + unsigned borrow = 0; if (tid < total_size){ S scalar = scalars[tid]; + // A point = points[tid]; + // if (scalar == S::zero() || scalar == S::one()) return; + // if (scalar == S::zero()) return; + // if (tid == 0) printf("scalar %u", scalar); for (unsigned bm = 0; bm < nof_bms; bm++) { + // bucket_index = scalar.get_scalar_digit(bm, c) + (bm==nof_bms-1? 
((tid&top_bm_nof_missing_bits)<<(c-top_bm_nof_missing_bits)) : 0); bucket_index = scalar.get_scalar_digit(bm, c); + #ifdef SIGNED_DIG + bucket_index += borrow; + borrow = 0; + unsigned sign = 0; + // if (tid == 0) printf("index %u", bucket_index); + if (bucket_index > (1<<(c-1))) { + bucket_index = (1 << c) - bucket_index; + borrow = 1; + sign = sign_mask; + } + #endif + // if (tid == 0) printf("new index %u", bucket_index); + // if (bm==nof_bms-1) { + // bucket_index2 = bucket_index + ((tid&((1< +__global__ void add_ones_kernel(A *points, S* scalars, P* results, const unsigned msm_size, const unsigned run_length){ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned nof_threads = (msm_size + run_length - 1)/run_length; + if (tid>=nof_threads) { + results[tid] = P::zero(); + return; + } + const unsigned start_index = tid*run_length; + P sum = P::zero(); + for (int i=start_index;i // __global__ void accumulate_buckets_kernel(P *__restrict__ buckets, unsigned *__restrict__ bucket_offsets, - // unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, unsigned *__restrict__ point_indices, A *__restrict__ points, unsigned nof_buckets, unsigned batch_size, unsigned msm_idx_shift){ -__global__ void accumulate_buckets_kernel(P *buckets, unsigned *bucket_offsets, unsigned *bucket_sizes, unsigned *single_bucket_indices, unsigned *point_indices, A *points, unsigned nof_buckets, unsigned *nof_buckets_to_compute, unsigned msm_idx_shift){ + // unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, unsigned *__restrict__ point_indices, A *__restrict__ points, unsigned nof_buckets, unsigned batch_size, unsigned msm_idx_shift){ +template +__global__ void accumulate_buckets_kernel(P *__restrict__ buckets, const unsigned *__restrict__ bucket_offsets, const unsigned *__restrict__ bucket_sizes, const unsigned *__restrict__ single_bucket_indices, const unsigned *__restrict__ point_indices, A *__restrict__ points, const unsigned nof_buckets, const unsigned *nof_buckets_to_compute, const unsigned msm_idx_shift, const unsigned c){ + constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned trash_bucket = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid >= *nof_buckets_to_compute){ + // if (tid>=*nof_buckets_to_compute || tid<11){ + if (tid>=*nof_buckets_to_compute){ return; } + if ((single_bucket_indices[tid]&((1<>msm_idx_shift; + const unsigned bm_index = (single_bucket_indices[tid]&((1<>c; + const unsigned bucket_index = msm_index * nof_buckets + bm_index * ((1<<(c-1))+1) + (single_bucket_indices[tid]&((1<>msm_idx_shift; unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid]&((1< 10) {printf(">10: %u %u %u\n",tid,single_bucket_indices[tid],single_bucket_indices[tid]&((1<=*nof_buckets_to_compute-10) printf("tid %u size %u\n", tid, bucket_sizes[tid]); + // if (tid==0) return; + // if ((bucket_index>>20)==13) return; + // if (bucket_sizes[tid]==16777216) printf("tid %u size %u bucket %u offset %u\n", tid, bucket_sizes[tid], bucket_index, bucket_offset); + // const unsigned *indexes = point_indices + bucket_offset; + // P bucket = P::zero(); //todo: get rid of init buckets? no.. because what about buckets with no points + P bucket; //todo: get rid of init buckets? no.. 
because what about buckets with no points + // unsigned point_ind; for (unsigned i = 0; i < bucket_sizes[tid]; i++) //add the relevant points starting from the relevant offset up to the bucket size { - buckets[bucket_index] = buckets[bucket_index] + points[point_indices[bucket_offset+i]]; + // unsigned point_ind = *indexes++; + // auto point = memory_load(points + point_ind); + // point_ind = point_indices[bucket_offset+i]; + // bucket = bucket + P::one(); + unsigned point_ind = point_indices[bucket_offset+i]; + #ifdef SIGNED_DIG + unsigned sign = point_ind & sign_mask; + point_ind &= ~sign_mask; + // printf("tid %u sign %u point ind %u \n", tid,sign, point_ind); + A point = points[point_ind]; + if (sign) point = A::neg(point); + #else + A point = points[point_ind]; + #endif + bucket = i? bucket + point : P::from_affine(point); + // const unsigned* pa = reinterpret_cast(points[point_ind]); + // P point; + // Dummy_Scalar scal; + // scal.x = __ldg(pa); + // point.x = scal; + // bucket = bucket + point; + } + // buckets[tid] = bucket; + buckets[bucket_index] = bucket; +} + +template +__global__ void accumulate_buckets_kernel2(P *buckets, A *points, S *scalars, const unsigned c,const unsigned nof_bms, const unsigned size){ + + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid>=size) return; + + S scalar = scalars[tid]; + A point = points[tid]; + unsigned bucket_index; + + for (unsigned bm = 0; bm < nof_bms; bm++) + { + // bucket_index = scalar.get_scalar_digit(bm, c) + (bm==nof_bms-1? ((tid&top_bm_nof_missing_bits)<<(c-top_bm_nof_missing_bits)) : 0); + bucket_index = scalar.get_scalar_digit(bm, c); + buckets[bucket_index] = buckets[bucket_index] + point; } + } //this kernel sums the entire bucket module @@ -106,16 +598,103 @@ template __global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c){ unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid >= nof_bms) return; - P line_sum = buckets[(tid+1)*(1<=nof_bms) return; + #ifdef SIGNED_DIG + unsigned buckets_in_bm = (1<0; i--) + for (unsigned i = buckets_in_bm-2; i >0; i--) { - line_sum = line_sum + buckets[tid*(1< +__global__ void split_windows_kernel_inner(const unsigned source_window_bits_count, const unsigned source_windows_count, + const P *__restrict__ source_buckets, P *__restrict__ target_buckets, const unsigned count) { +const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; +if (gid >= count) //0,1,2,2^8,2^8+1,2^8+2,32*2^8,32*2^8+1,32*2^8+2^8,32*2^8+2^8+1 +return; +const unsigned target_window_bits_count = (source_window_bits_count + 1) >> 1; //8 +const unsigned target_windows_count = source_windows_count << 1; //32 +const unsigned target_partition_buckets_count = target_windows_count << target_window_bits_count; // 32*2^8 +const unsigned target_partitions_count = count / target_partition_buckets_count; //2^7 +const unsigned target_partition_index = gid / target_partition_buckets_count; //*0,0,0,0,0,0,1,1,1,1 +const unsigned target_partition_tid = gid % target_partition_buckets_count; //*0,1,2,2^8,2^8+1,2^8+2,0,1,2^8,2^8+1 +const unsigned target_window_buckets_count = 1 << target_window_bits_count; // 2^8 +const unsigned target_window_index = target_partition_tid / target_window_buckets_count; //* 0,0.0,1,1,1,0,0,1,1 +const unsigned target_window_tid = target_partition_tid % target_window_buckets_count; //* 0,1,2,0,1,2,0,1,0,1,2 +const unsigned split_index = target_window_index & 1; //*0,0,0,1,1,1,0,0,1,1,1 +const unsigned source_window_buckets_per_target = 
source_window_bits_count & 1 // is c odd? +? split_index ? (target_window_tid >> (target_window_bits_count - 1) ? 0 : target_window_buckets_count) //is the target odd? + : 1 << (source_window_bits_count - target_window_bits_count) +: target_window_buckets_count; //2^8 +const unsigned source_window_index = target_window_index >> 1; //*0,0,0,0,0,0,0,0,0,0,0 +const unsigned source_offset = source_window_index << source_window_bits_count; //*0,0,0,0,0,0,0,0,0,0, +const unsigned target_shift = target_window_bits_count * split_index; //*0,0,0,8,8,8,0,0,8,8,8 +const unsigned target_offset = target_window_tid << target_shift;//*0,1,2,0,2^8,2^9,0,1,0,2^8,2*2^8 +const unsigned global_offset = source_offset + target_offset;//*0,1,2,0,2^8,2^9,0,1 +const unsigned index_mask = (1 << target_shift) - 1; //*0,0,0,2^8-1,2^8-1,2^8-1,0,0,2^8-1,2^8-1 +P target_bucket = P::zero(); +#pragma unroll 1 +for (unsigned i = target_partition_index; i < source_window_buckets_per_target; i += target_partitions_count) { //from the partition start(*0,0,0,0,0,0,1,1,1,1), stride 2^7, until 2^8 = loop twice +const unsigned index_offset = i & index_mask | (i & ~index_mask) << target_window_bits_count; //*0 2^15,0 2^15,0 2^15,0 2^15,0 2^15,0 2^15,2^8 2^8+2^15,2^8 2^8+2^15,2^8 2^8+2^15,2^8 2^8+2^15 +const unsigned load_offset = global_offset + index_offset;//*0 2^15,1 2^15+1,2 2^15+2, 0 2^15, 2^8 2^8+2^15, 2^8 2^8+2^15, 2^8+1 2^8+2^15+1 +const auto source_bucket = source_buckets[load_offset]; +target_bucket = i == target_partition_index ? source_bucket : target_bucket + source_bucket; //*0+2^15,1+2^15+1,2+2^15+2,...2^8-1+2^15+2^8-1| 0+2^7, 2^8+2^8+2^7...||2^8+2^8+2^15, 2^8+1+2^8+2^15+1...2^9-1+2^9-1+2^15|1+2^7+1, 2^8+1+2^8+2^7+1... +} +target_buckets[gid] = target_bucket; //0,1,2^8,2^8+1,32*2^8,32*2^8+1 +} + +template +__global__ void reduce_buckets_kernel(P *buckets, const unsigned count) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= count) + return; + // buckets += gid; + const auto a = buckets[gid]; + const auto b = buckets[gid+count]; + const P result = a+b; + buckets[gid] = result; +} + +template +__global__ void reduce_buckets_kernel2(P *source, P *target, const unsigned count) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= count) + return; + const auto a = source[gid]; + const auto b = source[gid+count]; + const P result = a+b; + target[gid] = result; +} + +template +__global__ void last_pass_gather_kernel(const unsigned bits_count_pass_one, const P *__restrict__ source, P *__restrict__ target, + const unsigned count) { +const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; +if (gid >= count) +return; +unsigned window_index = gid / bits_count_pass_one; +unsigned window_tid = gid % bits_count_pass_one; +for (unsigned bits_count = bits_count_pass_one; bits_count > 1;) { +bits_count = (bits_count + 1) >> 1; +window_index <<= 1; +if (window_tid >= bits_count) { +window_index++; +window_tid -= bits_count; +} +} +const unsigned sid = (window_index << 1) + 1; +const auto pz = source[sid]; +// const point_jacobian pj = point_xyzz::to_jacobian(pz, f); +target[gid] = pz; +} + //this kernel uses single scalar multiplication to multiply each bucket by its index //each thread deals with a single bucket template @@ -130,10 +709,17 @@ __global__ void ssm_buckets_kernel(P* buckets, unsigned* single_bucket_indices, } +template +__global__ void last_pass_kernel(P*final_buckets, P*final_sums, unsigned num_sums){ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if 
(tid>num_sums) return; + final_sums[tid] = final_buckets[2*tid+1]; +} + //this kernel computes the final result using the double and add algorithm //it is done by a single thread template -__global__ void final_accumulation_kernel(P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c){ +__global__ void final_accumulation_kernel(P* final_sums, P* ones_result, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c){ unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid>nof_msms) return; @@ -146,14 +732,168 @@ __global__ void final_accumulation_kernel(P* final_sums, P* final_results, unsig final_result = final_result + final_result; } } - final_results[tid] = final_result + final_sums[tid*nof_bms]; + final_results[tid] = final_result + final_sums[tid*nof_bms] + ones_result[0]; + // final_results[tid] = final_result + final_sums[tid*nof_bms]; + +} + +template +void test_reduce_triangle(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + // reduce_triangles_kernel<<<5,8>>>(buckets,temp,target,4,4); + general_sum_reduction_kernel<<<5,8>>>(buckets,target,4,4,0); + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< h_buckets; + // h_buckets.reserve(nof_buckets); + // cudaMemcpy(h_buckets.data(), buckets, sizeof(P) * nof_buckets, cudaMemcpyDeviceToHost); + // std::cout<<"buckets accumulated"< +void test_reduce_var(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + // single_stage_multi_reduction_kernel<<<1,64>>>(buckets,target,16,8,0); + unsigned h_sizes[10] = {4,4,4,4}; + unsigned h_offsets[10] = {2,2,6,6}; + unsigned *sizes; + unsigned *offsets; + cudaMalloc(&sizes, sizeof(unsigned) * count); + cudaMalloc(&offsets, sizeof(unsigned) * count); + cudaMemcpy(sizes, h_sizes, sizeof(unsigned) * count, cudaMemcpyHostToDevice); + cudaMemcpy(offsets, h_offsets, sizeof(unsigned) * count, cudaMemcpyHostToDevice); + variable_block_multi_reduction_kernel<<<1,4>>>(buckets,target,sizes,offsets,0,0); + + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< +void test_reduce_single(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + // single_stage_multi_reduction_kernel<<<1,64>>>(buckets,target,16,8,0); + single_stage_multi_reduction_kernel<<<2,32>>>(buckets,target,2,0,0); + + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< +void test_reduce_rectangle(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + general_sum_reduction_kernel<<<20,2>>>(buckets,target,1,4,1); + + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< -void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream) { +void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream) { + // std::cout<<"points"<>>(buckets, nof_buckets); + // cudaDeviceSynchronize(); +// printf("cuda error %u\n",cudaGetLastError()); + + //accumulate ones + P *ones_results; //fix whole division, in last run in kernel too + const unsigned nof_runs = max(1<<(msm_log_size-6), 16); + const unsigned run_length = (size + nof_runs -1)/nof_runs; + cudaMallocAsync(&ones_results, sizeof(P) * nof_runs, stream); + NUM_THREADS = min(1 << 8,nof_runs); + NUM_BLOCKS = (nof_runs + NUM_THREADS - 1) / NUM_THREADS; + add_ones_kernel<<>>(d_points, d_scalars, ones_results, size, run_length); + // cudaDeviceSynchronize(); +// printf("cuda error ones %u\n",cudaGetLastError()); + + // cudaDeviceSynchronize(); + // std::vector
<P>
h_ones_results; + // h_ones_results.reserve(nof_runs); + // cudaMemcpy(h_ones_results.data(), ones_results, sizeof(P) * nof_runs, cudaMemcpyDeviceToHost); + // std::cout<<"one results"<>1;s>0;s>>=1){ + NUM_THREADS = min(MAX_TH,s); + NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS; + single_stage_multi_reduction_kernel<<>>(ones_results,ones_results,s*2,0,0,0); + // cudaDeviceSynchronize(); + // printf("cuda error ones %u\n",cudaGetLastError()); + // cudaDeviceSynchronize(); + // cudaMemcpy(h_ones_results.data(), ones_results, sizeof(P) * nof_runs, cudaMemcpyDeviceToHost); + // std::cout<<"one results"<>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size, - nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later - + nof_bms, bm_bitsize, c, top_bm_nof_missing_bits); //+size - leaving the first bm free for the out of place sort later + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + + + // cudaDeviceSynchronize(); + // std::vector h_bucket_ind; + // std::vector h_point_ind; + // h_bucket_ind.reserve(size * (nof_bms+1)); + // h_point_ind.reserve(size * (nof_bms+1)); + // cudaMemcpy(h_bucket_ind.data(), bucket_indices, sizeof(unsigned) * size * (nof_bms+1), cudaMemcpyDeviceToHost); + // cudaMemcpy(h_point_ind.data(), point_indices, sizeof(unsigned) * size * (nof_bms+1), cudaMemcpyDeviceToHost); + // std::cout<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices, - d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize); + d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize, c); +// accumulate_buckets_kernel<<>>(buckets, sorted_bucket_offsets, sorted_bucket_sizes, sorted_single_bucket_indices, point_indices, +// d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize, c); + // accumulate_buckets_kernel<<>>(buckets, sorted_bucket_offsets, sorted_bucket_sizes, sorted_single_bucket_indices, point_indices, + // d_points, nof_buckets, nof_buckets_to_compute, c-1+bm_bitsize); + // cudaDeviceSynchronize(); + // printf("cuda error acc %u\n",cudaGetLastError()); +#else +NUM_THREADS = 1 << 8; +// NUM_THREADS = 1 << 5; +NUM_BLOCKS = (size + NUM_THREADS - 1) / NUM_THREADS; +accumulate_buckets_kernel2<<>>(buckets, points, scalars, c, nof_bms, size); +// cudaDeviceSynchronize(); +// printf("cuda error 111%u\n",cudaGetLastError()); +#endif +//reduce top bm +// NUM_THREADS = min(MAX_TH,(source_buckets_count>>(1+j))); +// printf("NUM_THREADS 1 %u \n" ,NUM_THREADS); +// NUM_BLOCKS = ((source_buckets_count>>(1+j)) + NUM_THREADS - 1) / NUM_THREADS; +// printf("NUM_BLOCKS 1 %u \n" ,NUM_BLOCKS); +// single_stage_multi_reduction_kernel<<>>(j==0?source_buckets:temp_buckets1,j==target_bits_count-1? target_buckets: temp_buckets1,1<<(source_bits_count-j),j==target_bits_count-1? 
1<>>(buckets, final_results); #endif - #ifdef BIG_TRIANGLE - P* final_results; + P* final_results; + if (big_triangle){ cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); //launch the bucket module sum kernel - a thread for each bucket module NUM_THREADS = nof_bms; NUM_BLOCKS = 1; - big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c); - #endif + #ifdef SIGNED_DIG + big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c-1); //sighed digits + #else + big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c); + #endif + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + } + #ifdef ZPRIZE + else{ + + unsigned source_bits_count = c; + unsigned source_windows_count = nof_bms; + P *source_buckets = buckets; + buckets = nullptr; + P *target_buckets; + for (unsigned i = 0;; i++) { + const unsigned target_bits_count = (source_bits_count + 1) >> 1; //c/2=8 + const unsigned target_windows_count = source_windows_count << 1; //nof bms*2 = 32 + const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8 + const unsigned log_data_split = + get_optimal_log_data_split(84, source_bits_count, target_bits_count, target_windows_count); //todo - get num of multiprossecors + const unsigned total_buckets_count = target_buckets_count << log_data_split; //32*2^8*2^7 + cudaMallocAsync(&target_buckets, sizeof(P) * total_buckets_count, stream); //32*2^8*2^7 buckets + NUM_THREADS = 32; + NUM_BLOCKS = (total_buckets_count + NUM_THREADS - 1) / NUM_THREADS; + // const unsigned block_dim = total_buckets_count < 32 ? total_buckets_count : 32; + // const unsigned grid_dim = (total_buckets_count - 1) / block_dim.x + 1; + split_windows_kernel_inner<<>>(source_bits_count, source_windows_count, source_buckets, target_buckets, total_buckets_count); + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + cudaFreeAsync(source_buckets, stream); + + for (unsigned j = 0; j < log_data_split; j++){ + const unsigned count = total_buckets_count >> (j + 1); + // const unsigned block_dim = count < 32 ? count : 32; + // const unsigned grid_dim = (count - 1) / block_dim.x + 1; + NUM_THREADS = 32; + NUM_BLOCKS = (count + NUM_THREADS - 1) / NUM_THREADS; + reduce_buckets_kernel<<>>(target_buckets, count); + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + } + if (target_bits_count == 1) { + // P results; + // // const unsigned result_windows_count = min(fd_q::MBC, windows_count_pass_one * bits_count_pass_one); + const unsigned result_windows_count = bitsize; + // if (copy_results) + // HANDLE_CUDA_ERROR(allocate(results, result_windows_count, pool, stream)); + // HANDLE_CUDA_ERROR(last_pass_gather(bits_count_pass_one, target_buckets, copy_results ? 
results : ec.results, result_windows_count, stream)); + // if (copy_results) { + // HANDLE_CUDA_ERROR(cudaMemcpyAsync(ec.results, results, sizeof(point_jacobian) * result_windows_count, cudaMemcpyDeviceToHost, stream)); + // if (ec.d2h_copy_finished) + // HANDLE_CUDA_ERROR(cudaEventRecord(ec.d2h_copy_finished, stream)); + // if (ec.d2h_copy_finished_callback) + // HANDLE_CUDA_ERROR(cudaLaunchHostFunc(stream, ec.d2h_copy_finished_callback, ec.d2h_copy_finished_callback_data)); + // } + // if (copy_results) + // HANDLE_CUDA_ERROR(free(results, stream)); + // HANDLE_CUDA_ERROR(free(target_buckets, stream)); + nof_bms = bitsize; + cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); + NUM_THREADS = 32; + NUM_BLOCKS = (result_windows_count + NUM_THREADS - 1) / NUM_THREADS; + // const dim3 block_dim = result_windows_count < 32 ? count : 32; + // const dim3 grid_dim = (result_windows_count - 1) / block_dim.x + 1; + last_pass_gather_kernel<<>>(c, target_buckets, final_results, result_windows_count); + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + c = 1; + break; + } + source_buckets = target_buckets; + target_buckets = nullptr; + source_bits_count = target_bits_count; + source_windows_count = target_windows_count; + } +} +#else +else{ + // cudaDeviceSynchronize(); +// printf("cuda erddsdfsdfsror %u\n",cudaGetLastError()); +// cudaDeviceSynchronize(); +// std::vector
<P>
h_buckets; +// h_buckets.reserve(nof_buckets); +// cudaMemcpy(h_buckets.data(), buckets, sizeof(P) * nof_buckets, cudaMemcpyDeviceToHost); +// std::cout<<"buckets accumulated"<>>(source_buckets,source_buckets_padded,1< s_buckets; + // s_buckets.reserve(source_buckets_count); + // cudaMemcpy(s_buckets.data(), source_buckets_padded, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"source_buckets_padded"<> 1; //c/2=8 + // printf("target_bits_count %u \n" ,target_bits_count); + const unsigned target_windows_count = source_windows_count << 1; //nof bms*2 = 32 + // const unsigned target_window_buckets_count = 1 << target_bits_count; // 2^8 + const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8 + // const unsigned log_data_split = + // get_optimal_log_data_split(84, source_bits_count, target_bits_count, target_windows_count); //todo - get num of multiprossecors + // const unsigned total_buckets_count = target_buckets_count << log_data_split; //32*2^8*2^7 + cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count,stream); //32*2^8*2^7 buckets + cudaMallocAsync(&temp_buckets1, sizeof(P) * source_buckets_count/2,stream); //32*2^8*2^7 buckets + cudaMallocAsync(&temp_buckets2, sizeof(P) * source_buckets_count/2,stream); //32*2^8*2^7 buckets + // const unsigned block_dim = total_buckets_count < 32 ? total_buckets_count : 32; + // const unsigned grid_dim = (total_buckets_count - 1) / block_dim.x + 1; + //input output, streams + // reduce_buckets_kernel<<>>(source_buckets, target_buckets, source_windows_count>>1); + // for (unsigned j = 0; j < target_windows_count-1; j++) //another loop + // reduce_buckets_kernel<<>>(target_buckets, target_buckets, source_windows_count>>(j+2)); + + // cudaStream_t stream2; + // cudaStreamCreate(&streams[0]); + // cudaStreamCreate(&streams[1]); + // cudaStreamCreate(&stream2); + + // if (source_bits_count>8){ + if (source_bits_count>0){ + for(unsigned j=0;j t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count); + // t2_buckets.reserve(source_buckets_count); + // cudaMemcpy(t1_buckets.data(), temp_buckets1, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // cudaMemcpy(t2_buckets.data(), temp_buckets2, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"0 buckets temp1"<>(1+j))); + // printf("NUM_THREADS 1 %u \n" ,NUM_THREADS); + NUM_BLOCKS = ((source_buckets_count>>(1+j)) + NUM_THREADS - 1) / NUM_THREADS; + // printf("NUM_BLOCKS 1 %u \n" ,NUM_BLOCKS); + single_stage_multi_reduction_kernel<<>>(j==0?source_buckets:temp_buckets1,j==target_bits_count-1? target_buckets: temp_buckets1,1<<(source_bits_count-j),j==target_bits_count-1? 1< t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count/2); + // t2_buckets.reserve(source_buckets_count/2); + // cudaMemcpy(t1_buckets.data(), temp_buckets1, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // cudaMemcpy(t2_buckets.data(), temp_buckets2, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"1 buckets temp1"<>(1+j))*((odd_source_c&&j==target_bits_count-1)? 2 :1); + unsigned nof_threads = (source_buckets_count>>(1+j)); + NUM_THREADS = min(MAX_TH,nof_threads); + // printf("NUM_THREADS 2 %u \n" ,NUM_THREADS); + NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; + // printf("NUM_BLOCKS 2 %u \n" ,NUM_BLOCKS); + single_stage_multi_reduction_kernel<<>>(j==0?source_buckets:temp_buckets2,j==target_bits_count-1? target_buckets: temp_buckets2,1<<(target_bits_count-j),j==target_bits_count-1? 1< t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count/2); + // t2_buckets.reserve(source_buckets_count/2); + // cudaMemcpy(t1_buckets.data(), temp_buckets1, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // cudaMemcpy(t2_buckets.data(), temp_buckets2, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"2 buckets temp1"< t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count/2); + // t2_buckets.reserve(source_buckets_count/2); + // cudaMemcpy(t1_buckets.data(), target_buckets, sizeof(P) * target_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"2 buckets target"<>1) + NUM_THREADS - 1) / NUM_THREADS; + // NUM_THREADS = 1<<(source_bits_count-1); + // printf("NUM_THREADS %u \n" ,NUM_THREADS); + // NUM_BLOCKS = source_windows_count; + // printf("NUM_BLOCKS %u \n" ,NUM_BLOCKS); + // reduce_triangles_kernel<<>>(source_buckets,temp_buckets1,target_buckets,source_bits_count,source_windows_count); + // for(unsigned j=0;;j++){ + NUM_THREADS = 1<<(source_bits_count-1); + // printf("NUM_THREADS 1 %u \n" ,NUM_THREADS); + NUM_BLOCKS = source_windows_count; + // printf("NUM_BLOCKS 1 %u \n" ,NUM_BLOCKS); + general_sum_reduction_kernel<<>>(source_buckets,target_buckets,1< t_buckets; + // t_buckets.reserve(target_buckets_count); + // cudaMemcpy(t_buckets.data(), target_buckets, sizeof(P) * target_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"buckets target1"<>>(source_buckets,temp_buckets2,target_buckets,source_bits_count,source_windows_count); + NUM_THREADS = 1<<(target_bits_count-1); + // printf("NUM_THREADS 2 %u \n" ,NUM_THREADS); + NUM_BLOCKS = source_windows_count<>>(source_buckets,target_buckets,1,1< t_buckets; + // // t_buckets.reserve(target_buckets_count); + // cudaMemcpy(t_buckets.data(), target_buckets, sizeof(P) * target_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"buckets target2"<>1) + NUM_THREADS - 1) / NUM_THREADS; //2^15 + // reduce_buckets_kernel2<<>>(source_buckets+source_offset, temp_buckets1+source_offset, source_window_buckets_count>>1); //same source different target + // for (unsigned k = 0; k < target_bits_count-2; k++){ //0..5 + // NUM_BLOCKS = ((source_window_buckets_count>>(k+2)) + NUM_THREADS - 1) / NUM_THREADS;//2^14..2^9 + // reduce_buckets_kernel2<<>>(temp_buckets1+source_offset, temp_buckets1+source_offset, source_window_buckets_count>>(k+2)); //stream j + // } + // NUM_BLOCKS = ((source_window_buckets_count>>target_bits_count) + NUM_THREADS - 1) / NUM_THREADS;//2^8 + // reduce_buckets_kernel2<<>>(temp_buckets1+source_offset, target_buckets+target_offset, source_window_buckets_count>>target_bits_count); //stream j + // } + + // for (unsigned j = 0; j < source_windows_count*target_window_buckets_count; j++){ //loop on every segment of every source bm // 0..16*2^8-1 + // unsigned source_offset = j*target_window_buckets_count; + // unsigned target_offset = j%target_window_buckets_count+(j/target_window_buckets_count)*target_window_buckets_count*2 + target_window_buckets_count; + // NUM_BLOCKS = ((target_window_buckets_count>>1) + NUM_THREADS - 1) / NUM_THREADS; //2^7 + // reduce_buckets_kernel2<<>>(source_buckets+source_offset, temp_buckets2+source_offset, target_window_buckets_count>>1); //same source different target + // for (unsigned k = 0; k < target_bits_count-2; k++){ //0..5 + // NUM_BLOCKS = ((target_window_buckets_count>>(k+2)) + NUM_THREADS - 1) / NUM_THREADS; //last blocks are single threaded.. //2^6..2^1 + // reduce_buckets_kernel2<<>>(temp_buckets2+source_offset, temp_buckets2+source_offset, target_window_buckets_count>>(k+2));// stream j + source_windows_count + // } + // NUM_BLOCKS = 1; //last blocks are single threaded.. 
// + // reduce_buckets_kernel2<<>>(temp_buckets2+source_offset, target_buckets+target_offset, 1);// stream j + source_windows_count + // } + + // for (int k = 0; k < NUM_STREAMS; ++k) + // { + // cudaStreamSynchronize(streams[k]); + // cudaStreamDestroy(streams[k]); + // } + + // cudaFreeAsync(source_buckets, stream); + if (target_bits_count == 1) { + // P results; + // // const unsigned result_windows_count = min(fd_q::MBC, windows_count_pass_one * bits_count_pass_one); + // const unsigned result_windows_count = bitsize; + // if (copy_results) + // HANDLE_CUDA_ERROR(allocate(results, result_windows_count, pool, stream)); + // HANDLE_CUDA_ERROR(last_pass_gather(bits_count_pass_one, target_buckets, copy_results ? results : ec.results, result_windows_count, stream)); + // if (copy_results) { + // HANDLE_CUDA_ERROR(cudaMemcpyAsync(ec.results, results, sizeof(point_jacobian) * result_windows_count, cudaMemcpyDeviceToHost, stream)); + // if (ec.d2h_copy_finished) + // HANDLE_CUDA_ERROR(cudaEventRecord(ec.d2h_copy_finished, stream)); + // if (ec.d2h_copy_finished_callback) + // HANDLE_CUDA_ERROR(cudaLaunchHostFunc(stream, ec.d2h_copy_finished_callback, ec.d2h_copy_finished_callback_data)); + // } + // if (copy_results) + // HANDLE_CUDA_ERROR(free(results, stream)); + // HANDLE_CUDA_ERROR(free(target_buckets, stream)); + nof_bms = bitsize; + cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); + NUM_THREADS = 32; + NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS; + last_pass_kernel<<>>(target_buckets,final_results,nof_bms); + // for (int k=0;k h_final_results; + // h_final_results.reserve(nof_bms); + // cudaMemcpy(h_final_results.data(), final_results, sizeof(P) * nof_bms, cudaMemcpyDeviceToHost); + // std::cout<<"buckets summed"<<<<1,1,0,stream>>>(final_results, on_device ? final_result : d_final_result, 1, nof_bms, c); - + final_accumulation_kernel<<<1,1,0,stream>>>(final_results, ones_results, on_device ? 
final_result : d_final_result, 1, nof_bms, c); + // cudaDeviceSynchronize(); +// printf("cuda error %u\n",cudaGetLastError()); //copy final result to host cudaStreamSynchronize(stream); if (!on_device) @@ -288,15 +1560,23 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi cudaFreeAsync(d_final_result, stream); } cudaFreeAsync(buckets, stream); + #ifndef PHASE1_TEST cudaFreeAsync(bucket_indices, stream); cudaFreeAsync(point_indices, stream); cudaFreeAsync(single_bucket_indices, stream); cudaFreeAsync(bucket_sizes, stream); cudaFreeAsync(nof_buckets_to_compute, stream); cudaFreeAsync(bucket_offsets, stream); + #endif + // cudaFreeAsync(sorted_bucket_sizes,stream); + // cudaFreeAsync(sorted_bucket_offsets,stream); + // cudaFreeAsync(sorted_single_bucket_indices,stream); cudaFreeAsync(final_results, stream); + cudaFreeAsync(ones_results, stream); cudaStreamSynchronize(stream); + + } //this function computes msm using the bucket method @@ -344,7 +1624,7 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin NUM_THREADS = 1 << 8; NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS; split_scalars_kernel<<>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size, - msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later + msm_log_size, nof_bms, bm_bitsize, c,0); //+size - leaving the first bm free for the out of place sort later //sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket unsigned *sorted_bucket_indices; @@ -395,30 +1675,30 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin NUM_THREADS = 1 << 8; NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS; accumulate_buckets_kernel<<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, - d_points, nof_buckets, total_nof_buckets_to_compute, c+bm_bitsize); + d_points, nof_buckets, total_nof_buckets_to_compute, c+bm_bitsize,c); - #ifdef SSM_SUM - //sum each bucket - NUM_THREADS = 1 << 10; - NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); + // #ifdef SSM_SUM + // //sum each bucket + // NUM_THREADS = 1 << 10; + // NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; + // ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); - //sum each bucket module - P* final_results; - cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); - NUM_THREADS = 1<>>(buckets, final_results); - #endif - - #ifdef BIG_TRIANGLE + // //sum each bucket module + // P* final_results; + // cudaMalloc(&final_results, sizeof(P) * nof_bms); + // NUM_THREADS = 1<>>(buckets, final_results); + // #endif + + // #ifdef BIG_TRIANGLE P* bm_sums; cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream); //launch the bucket module sum kernel - a thread for each bucket module NUM_THREADS = 1<<8; NUM_BLOCKS = (nof_bms*batch_size + NUM_THREADS - 1) / NUM_THREADS; big_triangle_sum_kernel<<>>(buckets, bm_sums, nof_bms*batch_size, c); - #endif + // #endif P* d_final_results; if (!on_device) @@ -427,8 +1707,10 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin //launch the double and add kernel, a single thread for each msm NUM_THREADS = 1<<8; NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS; - 
final_accumulation_kernel<<>>(bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); + final_accumulation_kernel<<>>(bm_sums,bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); + final_accumulation_kernel<<>>(bm_sums,bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); + //copy final result to host if (!on_device) cudaMemcpyAsync(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost, stream); @@ -532,18 +1814,19 @@ void reference_msm(S* scalars, A* a_points, unsigned size){ unsigned get_optimal_c(const unsigned size) { if (size < 17) return 1; - // return 15; + // return 17; return ceil(log2(size))-4; } //this function is used to compute msms of size larger than 256 template -void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream){ - unsigned c = get_optimal_c(size); - // unsigned c = 6; - // unsigned bitsize = 32; - unsigned bitsize = 255; - bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, stream); +void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, cudaStream_t stream){ + // unsigned c = get_optimal_c(size); + unsigned c = 16; + // unsigned c = 8; + unsigned bitsize = S::NBITS; + // unsigned bitsize = 254; //get from field + bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, big_triangle, stream); } // this function is used to compute a batches of msms of size larger than 256 @@ -555,4 +1838,4 @@ void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_ unsigned bitsize = 255; batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device, stream); } -#endif +#endif \ No newline at end of file diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index c6e8b0566..7da8fdc65 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -3,7 +3,7 @@ #pragma once template -void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream); +void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream); template void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream); @@ -12,7 +12,7 @@ template void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream); template -void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream); +void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, cudaStream_t stream); template void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream); diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu index 5833e9cc3..e12d221d8 100644 --- a/icicle/appUtils/msm/tests/msm_test.cu +++ b/icicle/appUtils/msm/tests/msm_test.cu @@ -5,15 +5,27 @@ #include "../../utils/cuda_utils.cuh" #include "../../primitives/projective.cuh" #include "../../primitives/field.cuh" -#include "../../curves/bls12_381/curve_config.cuh" +// #include "../../curves/bls12_377/curve_config.cuh" +#include "../../curves/bn254/curve_config.cuh" -using namespace 
BLS12_381; +// using namespace BLS12_377; +using namespace BN254; class Dummy_Scalar { public: static constexpr unsigned NBITS = 32; unsigned x; + unsigned p = 10; + // unsigned p = 1<<30; + + static HOST_DEVICE_INLINE Dummy_Scalar zero() { + return {0}; + } + + static HOST_DEVICE_INLINE Dummy_Scalar one() { + return {1}; + } friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) { os << scalar.x; @@ -25,7 +37,7 @@ class Dummy_Scalar { } friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) { - return {p1.x+p2.x}; + return {(p1.x+p2.x)%p1.p}; } friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { @@ -36,11 +48,12 @@ class Dummy_Scalar { return (p1.x == p2); } - // static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) { - // return {Dummy_Scalar::neg(point.x)}; - // } + static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) { + return {scalar.p-scalar.x}; + } static HOST_INLINE Dummy_Scalar rand_host() { - return {(unsigned)rand()}; + return {(unsigned)rand()%10}; + // return {(unsigned)rand()}; } }; @@ -53,6 +66,10 @@ class Dummy_Projective { return {0}; } + static HOST_DEVICE_INLINE Dummy_Projective one() { + return {1}; + } + static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective &point) { return {point.x}; } @@ -61,9 +78,9 @@ class Dummy_Projective { return {point.x}; } - // static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) { - // return {Dummy_Scalar::neg(point.x)}; - // } + static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) { + return {Dummy_Scalar::neg(point.x)}; + } friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x+p2.x}; @@ -103,7 +120,8 @@ class Dummy_Projective { } static HOST_INLINE Dummy_Projective rand_host() { - return {(unsigned)rand()}; + return {(unsigned)rand()%10}; + // return {(unsigned)rand()}; } }; @@ -119,62 +137,99 @@ typedef affine_t test_affine; int main() { - unsigned batch_size = 4; - unsigned msm_size = 1<<15; + unsigned batch_size = 1; +// unsigned msm_size = 1<<21; + unsigned msm_size = 12180757; unsigned N = batch_size*msm_size; test_scalar *scalars = new test_scalar[N]; test_affine *points = new test_affine[N]; for (unsigned i=0;i(scalars, points, N, short_res); - for (unsigned i=0;i(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false); + // for (unsigned i=0;i(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false); // std::cout<<"final result large"<(scalars, points, batch_size, msm_size, batched_large_res, false); + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + auto begin1 = std::chrono::high_resolution_clock::now(); + large_msm(scalars, points, msm_size, large_res, false, true,stream1); + auto end1 = std::chrono::high_resolution_clock::now(); + auto elapsed1 = std::chrono::duration_cast(end1 - begin1); + printf("Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9); + // std::cout<(scalars, points, batch_size, msm_size, batched_large_res, false); - // large_msm(scalars, points, msm_size, large_res, false); + large_msm(scalars_d, points_d, msm_size, large_res_d, true, false,stream2); + // test_reduce_triangle(scalars); + // test_reduce_rectangle(scalars); + // test_reduce_single(scalars); + // test_reduce_var(scalars); auto end = std::chrono::high_resolution_clock::now(); auto 
elapsed = std::chrono::duration_cast(end - begin); - printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9); - std::cout<(scalars, points, msm_size); + std::cout<(scalars, points, msm_size); + + // std::cout<<"final results batched large"< __global__ void add_sub_array(E* res, E* in1, E* in2, uint32_t n) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < n) { + res[tid] = SUB ? in1[tid] - in2[tid] : in1[tid] + in2[tid]; + } + } + + template + int sub_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream) { + uint32_t NUM_THREADS = MAX_THREADS_BATCH; + uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS; + + add_sub_array <<>>(d_out, d_in1, d_in2, n); + + return 0; + } + + template + int add_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream) { + uint32_t NUM_THREADS = MAX_THREADS_BATCH; + uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS; + + add_sub_array <<>>(d_out, d_in1, d_in2, n); + + return 0; + } + /** * Interpolate a batch of polynomials from their evaluations on the same subgroup. * Note: this function does not preform any bit-reverse permutations on its inputs or outputs. @@ -14,9 +41,9 @@ * @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial. * @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`. */ -template int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { +template int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) { cudaMemcpyAsync(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice, stream); - ntt_inplace_batch_template(d_out, d_domain, n, batch_size, true, stream, true); + ntt_inplace_batch_template(d_out, d_domain, n, batch_size, true, coset, coset_powers, stream, true); return 0; } @@ -28,8 +55,8 @@ template int interpolate_batch(E * d_out, E * d_evaluat * @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup. * @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length). 
*/ -template int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, cudaStream_t stream) { - return interpolate_batch (d_out, d_evaluations, d_domain, n, 1, stream); +template int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) { + return interpolate_batch (d_out, d_evaluations, d_domain, n, 1, coset, coset_powers, stream); } template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) { @@ -73,8 +100,9 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_ if (coset) batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream); - - ntt_inplace_batch_template(d_out, d_domain, domain_size, batch_size, false, stream, true); + + S* _null = nullptr; + ntt_inplace_batch_template(d_out, d_domain, domain_size, batch_size, false, false, _null, stream, true); return 0; } @@ -96,22 +124,26 @@ int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, template int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + S* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } template int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + S* _null = nullptr; + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } template int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + S* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } template int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + S* _null = nullptr; + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } template @@ -139,6 +171,18 @@ int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } +template +int interpolate_scalars_on_coset(S* d_out, S* d_evaluations, S* d_domain, + unsigned n, S* coset_powers, cudaStream_t stream) { + return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream); +} + +template +int interpolate_scalars_on_coset_batch(S* d_out, S* d_evaluations, S* d_domain, + unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) { + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream); +} + template int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) { diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index bb53e97f1..5456911dd 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -3,6 +3,7 @@ #pragma once #include "../../utils/sharedmem.cuh" +#include "../vector_manipulation/ve_mod_mult.cuh" const uint32_t MAX_NUM_THREADS = 1024; const uint32_t MAX_THREADS_BATCH = 512; //TODO: allows 100% occupancy for scalar NTT for sm_86..sm_89 @@ 
-83,19 +84,11 @@ template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn, cu } -/** - * Multiply the elements of an input array by a scalar in-place. - * @param arr input array. - * @param n size of arr. - * @param n_inv scalar of type S (scalar). - */ -template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, uint32_t n, S scalar) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < n) { - arr[tid] = scalar * arr[tid]; - } -} - +enum Decimation { + NONE = 0, + DIF = 1, + DIT = 2, +}; /** * Cooley-Tuckey NTT. @@ -288,8 +281,16 @@ __global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_ * @param d_twiddles * @param n Length of `d_twiddles` array * @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`. + * @param inverse true for iNTT + * @param is_coset true for multiplication by coset + * @param coset should be array of lenght n - or in case of lesser than n, right-padded with zeroes + * @param stream CUDA stream + * @param is_sync_needed do perform sync of the supplied CUDA stream at the end of processing */ -template void ntt_inplace_batch_template(E * d_inout, S * d_twiddles, unsigned n, unsigned batch_size, bool inverse, cudaStream_t stream, bool is_sync_needed) { +template void ntt_inplace_batch_template( + E * d_inout, S * d_twiddles, unsigned n, unsigned batch_size, bool inverse, + bool is_coset, S * coset, cudaStream_t stream, bool is_sync_needed) +{ const int logn = int(log(n) / log(2)); bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE; const int log2_shmem_elems = is_shared_mem_enabled ? int(log(int(MAX_SHARED_MEM / sizeof(E))) / log(2)) : logn; @@ -309,12 +310,16 @@ template void ntt_inplace_batch_template(E * d_inout, S ntt_template_kernel <<>>(d_inout, n, d_twiddles, n, total_tasks, s, false); } + if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + num_threads = min(n / 2, MAX_NUM_THREADS); num_blocks = (n * batch_size + num_threads - 1) / num_threads; template_normalize_kernel <<>> (d_inout, n * batch_size, S::inv_log_size(logn)); } else { + if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled { ntt_template_kernel<<>>(d_inout, n, d_twiddles, n, total_tasks, s, true); @@ -353,8 +358,9 @@ template void ntt_inplace_batch_template(E * d_inout, S cudaMemcpyAsync(d_arr, arr, size_E, cudaMemcpyHostToDevice, stream); int NUM_THREADS = MAX_THREADS_BATCH; int NUM_BLOCKS = (batches + NUM_THREADS - 1) / NUM_THREADS; - - ntt_inplace_batch_template(d_arr, d_twiddles, n, batches, inverse, stream, false); + + S* _null = nullptr; + ntt_inplace_batch_template(d_arr, d_twiddles, n, batches, inverse, false, _null, stream, false); cudaMemcpyAsync(arr, d_arr, size_E, cudaMemcpyDeviceToHost, stream); cudaFreeAsync(d_arr, stream); diff --git a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh index 6bbf9a40a..236ad0079 100644 --- a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh +++ b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh @@ -7,6 +7,19 @@ #define MAX_THREADS_PER_BLOCK 256 +/** + * Multiply the elements of an input array by a scalar in-place. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). 
+ */ + template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, uint32_t n, S scalar) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < n) { + arr[tid] = scalar * arr[tid]; + } + } + // TODO: headers for prototypes and .c .cpp .cu files for implementations template __global__ void vectorModMult(S *scalar_vec, E *element_vec, E *result, size_t n_elments) @@ -49,6 +62,18 @@ int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments, cudaStream_ return 0; } +template +int vector_mod_mult_device(S *d_vec_a, E *d_vec_b, E *d_result, size_t n_elments) // TODO: in place so no need for third result vector +{ + // Set the grid and block dimensions + int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); + int threads_per_block = MAX_THREADS_PER_BLOCK; + + // Call the kernel to perform element-wise modular multiplication + vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); + return 0; +} + template __global__ void batchVectorMult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size) { diff --git a/icicle/curves/bls12_377/c_api.h b/icicle/curves/bls12_377/c_api.h new file mode 100644 index 000000000..34d1aa10f --- /dev/null +++ b/icicle/curves/bls12_377/c_api.h @@ -0,0 +1,33 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
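For reference, the element-wise helpers added to icicle/appUtils/vector_manipulation/ve_mod_mult.cuh above (template_normalize_kernel, vectorModMult, vector_mod_mult_device) all use the same one-thread-per-element layout with ceil-division grid sizing. The standalone sketch below reproduces that launch pattern outside the library; uint32_t and the small SKETCH_MODULUS constant are stand-ins for the field types E and S and are not part of the patch.

// Illustrative sketch only: uint32_t and SKETCH_MODULUS replace the field types E/S.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

#define SKETCH_THREADS_PER_BLOCK 256
#define SKETCH_MODULUS 65521u // arbitrary small prime chosen for the example

__global__ void scale_mod_kernel(uint32_t *arr, uint32_t n, uint32_t scalar) {
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid < n) // same shape as template_normalize_kernel: in-place scale of each element
    arr[tid] = (uint32_t)(((uint64_t)arr[tid] * scalar) % SKETCH_MODULUS);
}

int main() {
  const uint32_t n = 1u << 10;
  uint32_t *h_arr = new uint32_t[n];
  for (uint32_t i = 0; i < n; i++) h_arr[i] = i;

  uint32_t *d_arr;
  cudaMalloc(&d_arr, n * sizeof(uint32_t));
  cudaMemcpy(d_arr, h_arr, n * sizeof(uint32_t), cudaMemcpyHostToDevice);

  // Same grid sizing as vector_mod_mult_device: ceil(n / threads_per_block) blocks.
  int num_blocks = (n + SKETCH_THREADS_PER_BLOCK - 1) / SKETCH_THREADS_PER_BLOCK;
  scale_mod_kernel<<<num_blocks, SKETCH_THREADS_PER_BLOCK>>>(d_arr, n, 3u);
  cudaDeviceSynchronize();

  cudaMemcpy(h_arr, d_arr, n * sizeof(uint32_t), cudaMemcpyDeviceToHost);
  printf("arr[5] = %u (expected 15)\n", h_arr[5]);

  cudaFree(d_arr);
  delete[] h_arr;
  return 0;
}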
+ +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12377_projective_t BLS12377_projective_t; + +bool eq_bls12_377(BLS12377_projective_t *point1, BLS12377_projective_t *point2, size_t device_id); + +#ifdef __cplusplus +} +#endif diff --git a/icicle/curves/bls12_377/curve_config.cuh b/icicle/curves/bls12_377/curve_config.cuh index 1b1c95c4a..367fd061a 100644 --- a/icicle/curves/bls12_377/curve_config.cuh +++ b/icicle/curves/bls12_377/curve_config.cuh @@ -2,6 +2,9 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" +#if defined(G2_DEFINED) +#include "../../primitives/extension_field.cuh" +#endif #include "params.cuh" diff --git a/icicle/curves/bls12_377/lde.cu b/icicle/curves/bls12_377/lde.cu index e7e8b15f5..b4fcc80cf 100644 --- a/icicle/curves/bls12_377/lde.cu +++ b/icicle/curves/bls12_377/lde.cu @@ -24,7 +24,7 @@ extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size } } -extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -39,7 +39,7 @@ extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inv } } -extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -85,7 +85,8 @@ extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BL { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_377::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,8 +100,9 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_o { try { + BLS12_377::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -113,7 +115,8 @@ extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_377::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -127,8 +130,9 @@ extern "C" int interpolate_points_batch_cuda_bls12_377(BLS12_377::projective_t* { try { + BLS12_377::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -267,7 +271,8 @@ extern "C" int ntt_inplace_batch_cuda_bls12_377(BLS12_377::scalar_t* d_inout, BL try { cudaStreamCreate(&stream); - ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, stream, true); + BLS12_377::scalar_t* _null = nullptr; + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, 
_null, stream, true); return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_377/msm.cu b/icicle/curves/bls12_377/msm.cu index 73332ddbe..77d44a32f 100644 --- a/icicle/curves/bls12_377/msm.cu +++ b/icicle/curves/bls12_377/msm.cu @@ -12,7 +12,7 @@ int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[ { try { - large_msm(scalars, points, count, out, false, stream); + large_msm(scalars, points, count, out, false, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -53,7 +53,7 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377: { try { - large_msm(d_scalars, d_points, count, d_out, true, stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); cudaStreamSynchronize(stream); return 0; } diff --git a/icicle/curves/bls12_377/msm.h b/icicle/curves/bls12_377/msm.h new file mode 100644 index 000000000..fdfcd7418 --- /dev/null +++ b/icicle/curves/bls12_377/msm.h @@ -0,0 +1,53 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +// msm.h + +#ifndef _BLS12377_MSM_H +#define _BLS12377_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12377 projective and affine structs +typedef struct BLS12377_projective_t BLS12377_projective_t; +typedef struct BLS12377_affine_t BLS12377_affine_t; +typedef struct BLS12377_scalar_t BLS12377_scalar_t; + +int msm_cuda_bls12_377(BLS12377_projective_t* out, BLS12377_affine_t* points, + BLS12377_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_bls12_377(BLS12377_projective_t* out, BLS12377_affine_t* points, + BLS12377_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_bls12_377(BLS12377_projective_t* d_out, BLS12377_scalar_t* d_scalars, + BLS12377_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_bls12_377(BLS12377_projective_t* d_out, BLS12377_scalar_t* d_scalars, + BLS12377_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12377_MSM_H */ diff --git a/icicle/curves/bls12_377/ntt.h b/icicle/curves/bls12_377/ntt.h new file mode 100644 index 000000000..19842a7f9 --- /dev/null +++ b/icicle/curves/bls12_377/ntt.h @@ -0,0 +1,44 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BLS12377_NTT_H +#define _BLS12377_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12377 projective and affine structs +typedef struct BLS12377_projective_t BLS12377_projective_t; +typedef struct BLS12377_affine_t BLS12377_affine_t; +typedef struct BLS12377_scalar_t BLS12377_scalar_t; + +int ntt_cuda_bls12_377(BLS12377_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_bls12_377(BLS12377_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bls12_377(BLS12377_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bls12_377(BLS12377_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12377_NTT_H */ diff --git a/icicle/curves/bls12_377/params.cuh b/icicle/curves/bls12_377/params.cuh index a60375592..bd1de1084 100644 --- a/icicle/curves/bls12_377/params.cuh +++ b/icicle/curves/bls12_377/params.cuh @@ -153,7 +153,7 @@ namespace PARAMS_BLS12_377{ static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; // i^2, the square of the imaginary unit for the extension field - static constexpr uint32_t i_squared = 1; + static constexpr uint32_t i_squared = 5; // true if i^2 is negative static constexpr bool i_squared_is_negative = true; // G1 and G2 generators diff --git a/icicle/curves/bls12_377/ve_mod_mult.h b/icicle/curves/bls12_377/ve_mod_mult.h new file mode 100644 index 000000000..0da1817c6 --- /dev/null +++ b/icicle/curves/bls12_377/ve_mod_mult.h @@ -0,0 +1,41 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
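In the same spirit, the add_polys/sub_polys helpers added to icicle/appUtils/ntt/lde.cu earlier in this patch share a single add_sub_array kernel whose boolean template parameter selects subtraction or addition at compile time, so each instantiation carries only one branch. A minimal standalone sketch of that pattern follows; uint32_t stands in for the field element type E and the buffer contents are arbitrary.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// SUB is resolved at compile time, mirroring add_sub_array<E, SUB> in lde.cu.
template <bool SUB>
__global__ void add_sub_sketch(uint32_t *res, const uint32_t *in1, const uint32_t *in2, uint32_t n) {
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid < n)
    res[tid] = SUB ? in1[tid] - in2[tid] : in1[tid] + in2[tid];
}

int main() {
  const uint32_t n = 1u << 8;
  uint32_t *d_a, *d_b, *d_out;
  cudaMalloc(&d_a, n * sizeof(uint32_t));
  cudaMalloc(&d_b, n * sizeof(uint32_t));
  cudaMalloc(&d_out, n * sizeof(uint32_t));
  cudaMemset(d_a, 0, n * sizeof(uint32_t));
  cudaMemset(d_b, 0, n * sizeof(uint32_t));

  // Same ceil-division launch geometry as the add_polys/sub_polys wrappers.
  const uint32_t threads = 256;
  const uint32_t blocks = (n + threads - 1) / threads;
  add_sub_sketch<false><<<blocks, threads>>>(d_out, d_a, d_b, n); // addition path
  add_sub_sketch<true><<<blocks, threads>>>(d_out, d_a, d_b, n);  // subtraction path
  cudaDeviceSynchronize();
  printf("launch status: %d\n", (int)cudaGetLastError());

  cudaFree(d_a); cudaFree(d_b); cudaFree(d_out);
  return 0;
}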
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BLS12377_VEC_MULT_H +#define _BLS12377_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12377_projective_t BLS12377_projective_t; +typedef struct BLS12377_scalar_t BLS12377_scalar_t; + +int32_t vec_mod_mult_point_bls12_377(BLS12377_projective_t *inout, BLS12377_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_bls12_377(BLS12377_scalar_t *inout, BLS12377_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t matrix_vec_mod_mult_bls12_377(BLS12377_scalar_t *matrix_flattened, BLS12377_scalar_t *input, BLS12377_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12377_VEC_MULT_H */ diff --git a/icicle/curves/bls12_381/c_api.h b/icicle/curves/bls12_381/c_api.h new file mode 100644 index 000000000..605628550 --- /dev/null +++ b/icicle/curves/bls12_381/c_api.h @@ -0,0 +1,32 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12381_projective_t BLS12381_projective_t; + +bool eq_bls12_381(BLS12381_projective_t *point1, BLS12381_projective_t *point2, size_t device_id); + +#ifdef __cplusplus +} +#endif diff --git a/icicle/curves/bls12_381/curve_config.cuh b/icicle/curves/bls12_381/curve_config.cuh index c1f6781ea..24951fa56 100644 --- a/icicle/curves/bls12_381/curve_config.cuh +++ b/icicle/curves/bls12_381/curve_config.cuh @@ -2,6 +2,9 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" +#if defined(G2_DEFINED) +#include "../../primitives/extension_field.cuh" +#endif #include "params.cuh" diff --git a/icicle/curves/bls12_381/lde.cu b/icicle/curves/bls12_381/lde.cu index a79f4a5be..7bd92f89b 100644 --- a/icicle/curves/bls12_381/lde.cu +++ b/icicle/curves/bls12_381/lde.cu @@ -24,7 +24,7 @@ extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size } } -extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -39,7 +39,7 @@ extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inv } } -extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -85,7 +85,8 @@ extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BL { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_381::scalar_t* 
_null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,11 +100,9 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_o { try { - cudaStreamCreate(&stream); //TODO: we should avoid creating stream if default (cudaStream_t stream = 0) is passed. - // but default is not working as expected as valgrind still reports errors - auto result_code = interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); - cudaStreamDestroy(stream); //TODO: hotfix for not freeing memory - return result_code; + BLS12_381::scalar_t* _null = nullptr; + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -116,7 +115,8 @@ extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_381::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -130,10 +130,9 @@ extern "C" int interpolate_points_batch_cuda_bls12_381(BLS12_381::projective_t* { try { + BLS12_381::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - auto result_code = interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); - cudaStreamDestroy(stream); - return result_code; + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -276,7 +275,8 @@ extern "C" int ntt_inplace_batch_cuda_bls12_381(BLS12_381::scalar_t* d_inout, BL try { cudaStreamCreate(&stream); - ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, stream, true); + BLS12_381::scalar_t* _null = nullptr; + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true); return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_381/msm.cu b/icicle/curves/bls12_381/msm.cu index c32efa45f..4be352e29 100644 --- a/icicle/curves/bls12_381/msm.cu +++ b/icicle/curves/bls12_381/msm.cu @@ -12,7 +12,7 @@ int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[ { try { - large_msm(scalars, points, count, out, false, stream); + large_msm(scalars, points, count, out, false, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381: { try { - large_msm(d_scalars, d_points, count, d_out, true, stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); cudaStreamSynchronize(stream); return 0; } diff --git a/icicle/curves/bls12_381/msm.h b/icicle/curves/bls12_381/msm.h new file mode 100644 index 000000000..2e78a083f --- /dev/null +++ b/icicle/curves/bls12_381/msm.h @@ -0,0 +1,53 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +// msm.h + +#ifndef _BLS12381_MSM_H +#define _BLS12381_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12381 projective and affine structs +typedef struct BLS12381_projective_t BLS12381_projective_t; +typedef struct BLS12381_affine_t BLS12381_affine_t; +typedef struct BLS12381_scalar_t BLS12381_scalar_t; + +int msm_cuda_bls12_381(BLS12381_projective_t* out, BLS12381_affine_t* points, + BLS12381_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_bls12_381(BLS12381_projective_t* out, BLS12381_affine_t* points, + BLS12381_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_bls12_381(BLS12381_projective_t* d_out, BLS12381_scalar_t* d_scalars, + BLS12381_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_bls12_381(BLS12381_projective_t* d_out, BLS12381_scalar_t* d_scalars, + BLS12381_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12381_MSM_H */ diff --git a/icicle/curves/bls12_381/ntt.h b/icicle/curves/bls12_381/ntt.h new file mode 100644 index 000000000..3e4ac4054 --- /dev/null +++ b/icicle/curves/bls12_381/ntt.h @@ -0,0 +1,44 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BLS12381_NTT_H +#define _BLS12381_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12381 projective and affine structs +typedef struct BLS12381_projective_t BLS12381_projective_t; +typedef struct BLS12381_affine_t BLS12381_affine_t; +typedef struct BLS12381_scalar_t BLS12381_scalar_t; + +int ntt_cuda_bls12_381(BLS12381_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_bls12_381(BLS12381_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bls12_381(BLS12381_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bls12_381(BLS12381_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12381_NTT_H */ diff --git a/icicle/curves/bls12_381/params.cuh b/icicle/curves/bls12_381/params.cuh index 3589bd577..7de524dcd 100644 --- a/icicle/curves/bls12_381/params.cuh +++ b/icicle/curves/bls12_381/params.cuh @@ -21,6 +21,9 @@ namespace PARAMS_BLS12_381{ // 2*modulus^2 static constexpr storage<2*limbs_count> modulus_squared_2 = {0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, 0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7}; + // note: doesn't actually fit into 384 bits, and shouldn't be used! is added for compilation + static constexpr storage<2*limbs_count> modulus_squared_4 = {0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, + 0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7}; static constexpr unsigned modulus_bit_count = 255; // m = floor(2^(2*modulus_bit_count) / modulus) static constexpr storage m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad, 0xc1f823b4, 0xe2d772d, 0x7fb78ddf, 0x8d54253b}; diff --git a/icicle/curves/bls12_381/supported_operations.cu b/icicle/curves/bls12_381/supported_operations.cu index 314e9f719..11be2dbda 100644 --- a/icicle/curves/bls12_381/supported_operations.cu +++ b/icicle/curves/bls12_381/supported_operations.cu @@ -2,4 +2,4 @@ #include "lde.cu" #include "msm.cu" #include "ve_mod_mult.cu" -#include "poseidon.cu" \ No newline at end of file +#include "poseidon.cu" diff --git a/icicle/curves/bls12_381/ve_mod_mult.h b/icicle/curves/bls12_381/ve_mod_mult.h new file mode 100644 index 000000000..05627ebc7 --- /dev/null +++ b/icicle/curves/bls12_381/ve_mod_mult.h @@ -0,0 +1,41 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BLS12381_VEC_MULT_H +#define _BLS12381_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12381_projective_t BLS12381_projective_t; +typedef struct BLS12381_scalar_t BLS12381_scalar_t; + +int32_t vec_mod_mult_point_bls12_381(BLS12381_projective_t *inout, BLS12381_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_bls12_381(BLS12381_scalar_t *inout, BLS12381_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t matrix_vec_mod_mult_bls12_381(BLS12381_scalar_t *matrix_flattened, BLS12381_scalar_t *input, BLS12381_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12381_VEC_MULT_H */ diff --git a/icicle/curves/bn254/c_api.h b/icicle/curves/bn254/c_api.h new file mode 100644 index 000000000..dde669012 --- /dev/null +++ b/icicle/curves/bn254/c_api.h @@ -0,0 +1,34 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_g2_projective_t BN254_g2_projective_t; + +bool eq_bn254(BN254_projective_t *point1, BN254_projective_t *point2); +bool eq_g2_bn254(BN254_g2_projective_t *point1, BN254_g2_projective_t *point2); + +#ifdef __cplusplus +} +#endif diff --git a/icicle/curves/bn254/cuda.h b/icicle/curves/bn254/cuda.h new file mode 100644 index 000000000..fc05e1b1d --- /dev/null +++ b/icicle/curves/bn254/cuda.h @@ -0,0 +1,14752 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef __cuda_cuda_h__ +#define __cuda_cuda_h__ + +#include +#ifdef _MSC_VER +typedef unsigned __int32 cuuint32_t; +typedef unsigned __int64 cuuint64_t; +#else +#include +typedef uint32_t cuuint32_t; +typedef uint64_t cuuint64_t; +#endif + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#if defined(CUDA_FORCE_API_VERSION) + #if (CUDA_FORCE_API_VERSION == 3010) + #define __CUDA_API_VERSION 3010 + #else + #error "Unsupported value of CUDA_FORCE_API_VERSION" + #endif +#else + #define __CUDA_API_VERSION 10010 +#endif /* CUDA_FORCE_API_VERSION */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __CUDA_API_PER_THREAD_DEFAULT_STREAM + #define __CUDA_API_PTDS(api) api ## _ptds + #define __CUDA_API_PTSZ(api) api ## _ptsz +#else + #define __CUDA_API_PTDS(api) api + #define __CUDA_API_PTSZ(api) api +#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020 + #define cuDeviceTotalMem cuDeviceTotalMem_v2 + #define cuCtxCreate cuCtxCreate_v2 + #define cuModuleGetGlobal cuModuleGetGlobal_v2 + #define cuMemGetInfo cuMemGetInfo_v2 + #define cuMemAlloc cuMemAlloc_v2 + #define cuMemAllocPitch cuMemAllocPitch_v2 + #define cuMemFree cuMemFree_v2 + #define cuMemGetAddressRange cuMemGetAddressRange_v2 + #define cuMemAllocHost cuMemAllocHost_v2 + #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 + #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) + #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) + #define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) + #define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) + #define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) + #define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) + #define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) + #define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) + #define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) + #define cuMemcpyAtoHAsync 
__CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) + #define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) + #define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) + #define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) + #define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) + #define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) + #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) + #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) + #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) + #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) + #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) + #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) + #define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) + #define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) + #define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) + #define cuArrayCreate cuArrayCreate_v2 + #define cuArrayGetDescriptor cuArrayGetDescriptor_v2 + #define cuArray3DCreate cuArray3DCreate_v2 + #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 + #define cuTexRefSetAddress cuTexRefSetAddress_v2 + #define cuTexRefGetAddress cuTexRefGetAddress_v2 + #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000 + #define cuCtxDestroy cuCtxDestroy_v2 + #define cuCtxPopCurrent cuCtxPopCurrent_v2 + #define cuCtxPushCurrent cuCtxPushCurrent_v2 + #define cuStreamDestroy cuStreamDestroy_v2 + #define cuEventDestroy cuEventDestroy_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuLinkCreate cuLinkCreate_v2 + #define cuLinkAddData cuLinkAddData_v2 + #define cuLinkAddFile cuLinkAddFile_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuMemHostRegister cuMemHostRegister_v2 + #define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 10010 + #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) +#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture) +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 10010 */ + +#if !defined(__CUDA_API_VERSION_INTERNAL) +#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2 +#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */ +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) + #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) + #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) + #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) + #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) + #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) + #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) + + #define 
cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) + #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) + #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) + #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) + #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) + #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) + + #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) + #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) + #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) + #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) + #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) + #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) + #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) + #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) + #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) + #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) + #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) + #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) + #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) + #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) + #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) + #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) + + #define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32) + #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) + #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) + #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) + #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) + + #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) + + #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) + #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) + + #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) +#endif + +/** + * \file cuda.h + * \brief Header file for the CUDA Toolkit application programming interface. + * + * \file cudaGL.h + * \brief Header file for the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. + * + * \file cudaD3D9.h + * \brief Header file for the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. + */ + +/** + * \defgroup CUDA_TYPES Data types used by CUDA driver + * @{ + */ + +/** + * CUDA API version number + */ +#define CUDA_VERSION 10010 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * CUDA device pointer + * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. 
+ */ +#if __CUDA_API_VERSION >= 3020 + +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long CUdeviceptr; +#else +typedef unsigned int CUdeviceptr; +#endif + +#endif /* __CUDA_API_VERSION >= 3020 */ + +typedef int CUdevice; /**< CUDA device */ +typedef struct CUctx_st *CUcontext; /**< CUDA context */ +typedef struct CUmod_st *CUmodule; /**< CUDA module */ +typedef struct CUfunc_st *CUfunction; /**< CUDA function */ +typedef struct CUarray_st *CUarray; /**< CUDA array */ +typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ +typedef struct CUevent_st *CUevent; /**< CUDA event */ +typedef struct CUstream_st *CUstream; /**< CUDA stream */ +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ +typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */ +typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */ +typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ +typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ +typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ +typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ +typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ + +#ifndef CU_UUID_HAS_BEEN_DEFINED +#define CU_UUID_HAS_BEEN_DEFINED +typedef struct CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +} CUuuid; +#endif + +#if __CUDA_API_VERSION >= 4010 + +/** + * CUDA IPC handle size + */ +#define CU_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef struct CUipcEventHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcEventHandle; + +/** + * CUDA IPC mem handle + */ +typedef struct CUipcMemHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcMemHandle; + +/** + * CUDA Ipc Mem Flags + */ +typedef enum CUipcMem_flags_enum { + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ +} CUipcMem_flags; + +#endif + +/** + * CUDA Mem Attach Flags + */ +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ + CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ + CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ +} CUmemAttach_flags; + +/** + * Context creation flags + */ +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of CUDA 4.0 + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. 
*/ + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + CU_CTX_FLAGS_MASK = 0x1f +} CUctx_flags; + +/** + * Stream creation flags + */ +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +} CUstream_flags; + +/** + * Legacy stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_LEGACY ((CUstream)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with per-thread synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_PER_THREAD ((CUstream)0x2) + +/** + * Event creation flags + */ +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ +} CUevent_flags; + +#if __CUDA_API_VERSION >= 8000 +/** + * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 + */ +typedef enum CUstreamWaitValue_flags_enum { + CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit + values). Note this is a cyclic comparison which ignores wraparound. + (Default behavior.) */ + CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ + CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ + CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be + queried with ::cuDeviceGetAttribute() and + ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ + CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This + means that, if a remote write operation is guaranteed to have reached the + device before the wait can be satisfied, that write is guaranteed to be + visible to downstream device work. The device is permitted to reorder + remote writes internally. For example, this flag would be required if + two remote writes arrive in a defined order, the wait is satisfied by the + second write, and downstream work needs to observe the first write. + Support for this operation is restricted to selected platforms and can be + queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ +} CUstreamWaitValue_flags; + +/** + * Flags for ::cuStreamWriteValue32 + */ +typedef enum CUstreamWriteValue_flags_enum { + CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ + CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued + before it, as a performance optimization. Normally, + ::cuStreamWriteValue32 will provide a memory fence before the + write, which has similar semantics to + __threadfence_system() but is scoped to the stream + rather than a CUDA thread. 
*/ +} CUstreamWriteValue_flags; + +/** + * Operations for ::cuStreamBatchMemOp + */ +typedef enum CUstreamBatchMemOpType_enum { + CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ + CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ + CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a + standalone operation. */ +} CUstreamBatchMemOpType; + +/** + * Per-operation parameters for ::cuStreamBatchMemOp + */ +typedef union CUstreamBatchMemOpParams_union { + CUstreamBatchMemOpType operation; + struct CUstreamMemOpWaitValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } waitValue; + struct CUstreamMemOpWriteValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } writeValue; + struct CUstreamMemOpFlushRemoteWritesParams_st { + CUstreamBatchMemOpType operation; + unsigned int flags; + } flushRemoteWrites; + cuuint64_t pad[6]; +} CUstreamBatchMemOpParams; +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** + * Occupancy calculator flag + */ +typedef enum CUoccupancy_flags_enum { + CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ + CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ +} CUoccupancy_flags; + +/** + * Array formats + */ +typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ +} CUarray_format; + +/** + * Texture reference addressing modes + */ +typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ +} CUaddress_mode; + +/** + * Texture reference filtering modes + */ +typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ +} CUfilter_mode; + +/** + * Device properties + */ +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + 
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, 
/**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture 
height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. 
*/ + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ + CU_DEVICE_ATTRIBUTE_MAX +} CUdevice_attribute; + +/** + * Legacy device properties + */ +typedef struct CUdevprop_st { + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int totalConstantMemory; /**< Constant memory available on device in bytes */ + int SIMDWidth; /**< Warp size in threads */ + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int regsPerBlock; /**< 32-bit registers available per block */ + int clockRate; /**< Clock frequency in kilohertz */ + int textureAlign; /**< Alignment requirement for textures */ +} CUdevprop; + +/** + * Pointer information + */ +typedef enum CUpointer_attribute_enum { + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ + CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ + CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9 /**< A device ordinal of a device on which a pointer was allocated or registered */ +} CUpointer_attribute; + +/** + * Function properties + */ +typedef enum CUfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. 
+ * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set . + */ + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + /** + * The maximum size in bytes of dynamically-allocated shared memory that can be used by + * this function. If the user-specified dynamic shared memory size is larger than this + * value, the launch will fail. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. + * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + + CU_FUNC_ATTRIBUTE_MAX +} CUfunction_attribute; + +/** + * Function cache configurations + */ +typedef enum CUfunc_cache_enum { + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +} CUfunc_cache; + +/** + * Shared memory configurations + */ +typedef enum CUsharedconfig_enum { + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +} CUsharedconfig; + +/** + * Shared memory carveout configurations. 
These may be passed to ::cuFuncSetAttribute + */ +typedef enum CUshared_carveout_enum { + CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ +} CUshared_carveout; + +/** + * Memory types + */ +typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ +} CUmemorytype; + +/** + * Compute Modes + */ +typedef enum CUcomputemode_enum { + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ +} CUcomputemode; + +/** + * Memory advise values + */ +typedef enum CUmem_advise_enum { + CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occasionally be written to */ + CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ + CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ + CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ + CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ +} CUmem_advise; + +typedef enum CUmem_range_attribute_enum { + CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occasionally be written to */ + CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ + CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ + CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ +} CUmem_range_attribute; + +/** + * Online compiler and linker options + */ +typedef enum CUjit_option_enum +{ + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_MAX_REGISTERS = 0, + + /** + * IN: Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization fo the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. 
Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::CU_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_THREADS_PER_BLOCK, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + CU_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET_FROM_CUCONTEXT, + + /** + * Target is chosen based on supplied ::CUjit_target. Cannot be + * combined with ::CU_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::CUjit_target\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback. 
This option cannot be + * used with cuLink* APIs as the linker requires exact matches.\n + * Option type: unsigned int for enumerated type ::CUjit_fallback\n + * Applies to: compiler only + */ + CU_JIT_FALLBACK_STRATEGY, + + /** + * Specifies whether to create debug information in output (-g) + * (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_GENERATE_DEBUG_INFO, + + /** + * Generate verbose log messages (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LOG_VERBOSE, + + /** + * Generate line number information (-lineinfo) (0: false, default)\n + * Option type: int\n + * Applies to: compiler only + */ + CU_JIT_GENERATE_LINE_INFO, + + /** + * Specifies whether to enable caching explicitly (-dlcm) \n + * Choice is based on supplied ::CUjit_cacheMode_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n + * Applies to: compiler only + */ + CU_JIT_CACHE_MODE, + + /** + * The below jit options are used for internal purposes only, in this version of CUDA + */ + CU_JIT_NEW_SM3X_OPT, + CU_JIT_FAST_COMPILE, + + /** + * Array of device symbol names that will be relocated to the corresponing + * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * When loding a device module, driver will relocate all encountered + * unresolved symbols to the host addresses.\n + * It is only allowed to register symbols that correspond to unresolved + * global variables.\n + * It is illegal to register the same device symbol at multiple addresses.\n + * Option type: const char **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_NAMES, + + /** + * Array of host addresses that will be used to relocate corresponding + * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * Option type: void **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_ADDRESSES, + + /** + * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and + * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_COUNT, + + CU_JIT_NUM_OPTIONS + +} CUjit_option; + +/** + * Online compilation targets + */ +typedef enum CUjit_target_enum +{ + CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ + CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ + CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ + CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ + CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ + CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ + CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ + CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ + CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ + CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ + CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ + CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ + CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ + CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ + CU_TARGET_COMPUTE_75 = 75 /**< Compute device class 7.5.*/ +} CUjit_target; + +/** + * Cubin matching fallback strategies + */ +typedef enum CUjit_fallback_enum +{ + CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ + + CU_PREFER_BINARY /**< Prefer to fall back to compatible 
binary code if exact match not found */ + +} CUjit_fallback; + +/** + * Caching modes for dlcm + */ +typedef enum CUjit_cacheMode_enum +{ + CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ + CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ + CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ +} CUjit_cacheMode; + +/** + * Device code formats + */ +typedef enum CUjitInputType_enum +{ + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + CU_JIT_INPUT_CUBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + CU_JIT_INPUT_PTX, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_FATBINARY, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_OBJECT, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_LIBRARY, + + CU_JIT_NUM_INPUT_TYPES +} CUjitInputType; + +#if __CUDA_API_VERSION >= 5050 +typedef struct CUlinkState_st *CUlinkState; +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** + * Flags to register a graphics resource + */ +typedef enum CUgraphicsRegisterFlags_enum { + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 +} CUgraphicsRegisterFlags; + +/** + * Flags for mapping and unmapping interop resources + */ +typedef enum CUgraphicsMapResourceFlags_enum { + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 +} CUgraphicsMapResourceFlags; + +/** + * Array indices for cube faces + */ +typedef enum CUarray_cubemap_face_enum { + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ +} CUarray_cubemap_face; + +/** + * Limits + */ +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). 
This is a hint */ + CU_LIMIT_MAX +} CUlimit; + +/** + * Resource types + */ +typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} CUresourcetype; + +#ifdef _WIN32 +#define CUDA_CB __stdcall +#else +#define CUDA_CB +#endif + +#if __CUDA_API_VERSION >= 10000 + +/** + * CUDA host function + * \param userData Argument value passed to the function + */ +typedef void (CUDA_CB *CUhostFn)(void *userData); + +/** + * GPU kernel node parameters + */ +typedef struct CUDA_KERNEL_NODE_PARAMS_st { + CUfunction func; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to kernel parameters */ + void **extra; /**< Extra options */ +} CUDA_KERNEL_NODE_PARAMS; + +/** + * Memset node parameters + */ +typedef struct CUDA_MEMSET_NODE_PARAMS_st { + CUdeviceptr dst; /**< Destination device pointer */ + size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ + unsigned int value; /**< Value to be set */ + unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ + size_t width; /**< Width in bytes, of the row */ + size_t height; /**< Number of rows */ +} CUDA_MEMSET_NODE_PARAMS; + +/** + * Host node parameters + */ +typedef struct CUDA_HOST_NODE_PARAMS_st { + CUhostFn fn; /**< The function to call when the node executes */ + void* userData; /**< Argument to pass to the function */ +} CUDA_HOST_NODE_PARAMS; + +/** + * Graph node types + */ +typedef enum CUgraphNodeType_enum { + CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ + CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ + CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ + CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ + CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ + CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ + CU_GRAPH_NODE_TYPE_COUNT +} CUgraphNodeType; + +/** + * Possible stream capture statuses returned by ::cuStreamIsCapturing + */ +typedef enum CUstreamCaptureStatus_enum { + CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ + CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ + CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that + has been invalidated, but not terminated */ +} CUstreamCaptureStatus; + +#endif /* __CUDA_API_VERSION >= 10000 */ + +#if __CUDA_API_VERSION >= 10010 + +/** + * Possible modes for stream capture thread interactions.
For more details see + * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode + */ +typedef enum CUstreamCaptureMode_enum { + CU_STREAM_CAPTURE_MODE_GLOBAL = 0, + CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, + CU_STREAM_CAPTURE_MODE_RELAXED = 2 +} CUstreamCaptureMode; + +#endif /* __CUDA_API_VERSION >= 10010 */ + +/** + * Error codes + */ +typedef enum cudaError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + CUDA_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + CUDA_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cuProfilerStart or + * ::cuProfilerStop without initialization. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStart() when profiling is already enabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStop() when profiling is already disabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. 
+ */ + CUDA_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + CUDA_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, + + /** + * This indicates that the device kernel source is invalid. + */ + CUDA_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + CUDA_ERROR_ILLEGAL_STATE = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. 
+ */ + CUDA_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. 
+ */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_FAILED = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + + /** + * This error indicates that the attempted operation is not permitted. + */ + CUDA_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. 
+ */ + CUDA_ERROR_NOT_SUPPORTED = 801, + + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + CUDA_ERROR_SYSTEM_NOT_READY = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + * environment variable. + */ + CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, + + /** + * This error indicates that the operation is not permitted when + * the stream is capturing. + */ + CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, + + /** + * This error indicates that the current capture sequence on the stream + * has been invalidated due to a previous error. + */ + CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, + + /** + * This error indicates that the operation would have resulted in a merge + * of two independent capture sequences. + */ + CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, + + /** + * This error indicates that the capture was not initiated in this stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, + + /** + * This error indicates that the capture sequence contains a fork that was + * not joined to the primary stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, + + /** + * This error indicates that a dependency would have been created which + * crosses the capture sequence boundary. Only implicit in-stream ordering + * dependencies are allowed to cross the boundary. + */ + CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, + + /** + * This error indicates a disallowed implicit dependency on a current capture + * sequence from cudaStreamLegacy. + */ + CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, + + /** + * This error indicates that the operation is not permitted on an event which + * was last recorded in a capturing stream. + */ + CUDA_ERROR_CAPTURED_EVENT = 907, + + /** + * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED + * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a + * different thread. + */ + CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, + + /** + * This indicates that an unknown internal error has occurred. 
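+ *
+ * Whichever of these codes a driver call returns, a minimal reporting sketch
+ * (assuming a current context and <stdio.h>; ::cuGetErrorName and
+ * ::cuGetErrorString are declared later in this header) might look like:
+ * \code
+   CUresult err = cuCtxSynchronize();
+   if (err != CUDA_SUCCESS) {
+       const char *name = NULL, *desc = NULL;
+       cuGetErrorName(err, &name);    /* e.g. "CUDA_ERROR_ILLEGAL_ADDRESS" */
+       cuGetErrorString(err, &desc);  /* human-readable description */
+       fprintf(stderr, "driver call failed: %s (%s)\n", name, desc);
+   }
+ * \endcode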
+ */ + CUDA_ERROR_UNKNOWN = 999 +} CUresult; + +/** + * P2P Attributes + */ +typedef enum CUdevice_P2PAttribute_enum { + CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enabled */ + CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operations over the link are supported */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link is supported */ +} CUdevice_P2PAttribute; + +/** + * CUDA stream callback + * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. + * \param status ::CUDA_SUCCESS or any persistent error on the stream. + * \param userData User parameter provided at registration. + */ +typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); + +/** + * Block size to per-block dynamic shared memory mapping for a certain + * kernel \param blockSize Block size of the kernel. + * + * \return The dynamic shared memory needed by a block. + */ +typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 + +/** + * If set, host memory is allocated as write-combined - fast to write, + * faster to DMA, slow to read except via SSE4 streaming load instruction + * (MOVNTDQA). + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 + +/** + * If set, the passed memory pointer is treated as pointing to some + * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. + * On Windows the flag is a no-op. + * On Linux that memory is marked as non cache-coherent for the GPU and + * is expected to be physically contiguous. It may return + * CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, + * CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. + * On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED + * is returned.
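+ *
+ * For ordinary pageable allocations (not I/O memory), a minimal registration
+ * sketch, assuming <stdlib.h> and a current context, might be:
+ * \code
+   void *buf = malloc(1 << 20);
+   if (cuMemHostRegister(buf, 1 << 20, CU_MEMHOSTREGISTER_DEVICEMAP) == CUDA_SUCCESS) {
+       CUdeviceptr dptr;
+       cuMemHostGetDevicePointer(&dptr, buf, 0);  /* device-visible alias of buf */
+       /* ... use dptr in kernels ... */
+       cuMemHostUnregister(buf);
+   }
+   free(buf);
+ * \endcode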
+ * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 + +#if __CUDA_API_VERSION >= 3020 + +/** + * 2D memory copy parameters + */ +typedef struct CUDA_MEMCPY2D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ + size_t Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D; + +/** + * 3D memory copy parameters + */ +typedef struct CUDA_MEMCPY3D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D; + +/** + * 3D memory cross-context copy parameters + */ +typedef struct CUDA_MEMCPY3D_PEER_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< 
Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D_PEER; + +/** + * Array descriptor + */ +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + size_t Width; /**< Width of array */ + size_t Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR; + +/** + * 3D array descriptor + */ +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + size_t Width; /**< Width of 3D array */ + size_t Height; /**< Height of 3D array */ + size_t Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR; + +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 5000 + +/** + * CUDA Resource descriptor + */ +typedef struct CUDA_RESOURCE_DESC_st +{ + CUresourcetype resType; /**< Resource type */ + + union { + struct { + CUarray hArray; /**< CUDA array */ + } array; + struct { + CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ + } mipmap; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + + unsigned int flags; /**< Flags (must be zero) */ +} CUDA_RESOURCE_DESC; + +/** + * Texture descriptor + */ +typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; /**< Address modes */ + CUfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} CUDA_TEXTURE_DESC; + +/** + * Resource view format + */ +typedef enum CUresourceViewFormat_enum +{ + CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + 
CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} CUresourceViewFormat; + +/** + * Resource view descriptor + */ +typedef struct CUDA_RESOURCE_VIEW_DESC_st +{ + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} CUDA_RESOURCE_VIEW_DESC; + +/** + * GPU Direct v3 tokens + */ +typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { + unsigned long long p2pToken; + unsigned int vaSpaceToken; +} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + +#endif /* __CUDA_API_VERSION >= 5000 */ + +#if __CUDA_API_VERSION >= 9000 + +/** + * Kernel launch parameters + */ +typedef struct CUDA_LAUNCH_PARAMS_st { + CUfunction function; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + 
unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + CUstream hStream; /**< Stream identifier */ + void **kernelParams; /**< Array of pointers to kernel parameters */ +} CUDA_LAUNCH_PARAMS; + +#endif /* __CUDA_API_VERSION >= 9000 */ + +#if __CUDA_API_VERSION >= 10000 + +/** + * External memory handle types + */ +typedef enum CUexternalMemoryHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a D3D12 heap object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + /** + * Handle is a D3D12 committed resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 +} CUexternalMemoryHandleType; + +/** + * Indicates that the external memory object is a dedicated resource + */ +#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 + +/** + * External memory handle descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalMemoryHandleType type; + union { + /** + * File descriptor referencing the memory object. Valid + * when type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid memory object. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + } handle; + /** + * Size of the memory allocation + */ + unsigned long long size; + /** + * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + +/** + * External memory buffer descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + /** + * Offset into the memory object where the buffer's base is + */ + unsigned long long offset; + /** + * Size of the buffer + */ + unsigned long long size; + /** + * Flags reserved for future use. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + +/** + * External memory mipmap descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + /** + * Offset into the memory object where the base level of the + * mipmap chain is. 
+ */ + unsigned long long offset; + /** + * Format, dimension and type of base level of the mipmap chain + */ + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + /** + * Total number of levels in the mipmap chain + */ + unsigned int numLevels; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + +/** + * External semaphore handle types + */ +typedef enum CUexternalSemaphoreHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a shared NT handle referencing a D3D12 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 +} CUexternalSemaphoreHandleType; + +/** + * External semaphore handle descriptor + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalSemaphoreHandleType type; + union { + /** + * File descriptor referencing the semaphore object. Valid + * when type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid synchronization primitive. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + } handle; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + +/** + * External semaphore signal parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + unsigned int reserved[16]; + } params; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; + +/** + * External semaphore wait parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + unsigned int reserved[16]; + } params; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; + + +#endif /* __CUDA_API_VERSION >= 10000 */ + +/** + * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. 
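+ *
+ * Both launch-sync flags are passed in the \p flags argument of
+ * ::cuLaunchCooperativeKernelMultiDevice; a hypothetical call, with
+ * \p launchParams (an array of ::CUDA_LAUNCH_PARAMS, one entry per device)
+ * and \p numDevices prepared by the caller, might be:
+ * \code
+   cuLaunchCooperativeKernelMultiDevice(launchParams, numDevices,
+       CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC |
+       CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC);
+ * \endcode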
+ */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 + +/** + * If set, the CUDA array is a collection of layers, where each layer is either a 1D + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + * of layers, not the depth of a 3D array. + */ +#define CUDA_ARRAY3D_LAYERED 0x01 + +/** + * Deprecated, use CUDA_ARRAY3D_LAYERED + */ +#define CUDA_ARRAY3D_2DARRAY 0x01 + +/** + * This flag must be set in order to bind a surface reference + * to the CUDA array + */ +#define CUDA_ARRAY3D_SURFACE_LDST 0x02 + +/** + * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The + * width of such a CUDA array must be equal to its height, and Depth must be six. + * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps + * and Depth must be a multiple of six. + */ +#define CUDA_ARRAY3D_CUBEMAP 0x04 + +/** + * This flag must be set in order to perform texture gather operations + * on a CUDA array. + */ +#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 + +/** + * This flag if set indicates that the CUDA + * array is a DEPTH_TEXTURE. + */ +#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 + +/** + * This flag indicates that the CUDA array may be bound as a color target + * in an external graphics API + */ +#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 + +/** + * Override the texref format with a format inferred from the array. + * Flag for ::cuTexRefSetArray() + */ +#define CU_TRSA_OVERRIDE_FORMAT 0x01 + +/** + * Read the texture as integers rather than promoting the values to floats + * in the range [0,1]. + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_READ_AS_INTEGER 0x01 + +/** + * Use normalized texture coordinates in the range [0,1) instead of [0,dim). + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_NORMALIZED_COORDINATES 0x02 + +/** + * Perform sRGB->linear conversion during texture read. + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_SRGB 0x10 + +/** + * End of array terminator for the \p extra parameter to + * ::cuLaunchKernel + */ +#define CU_LAUNCH_PARAM_END ((void*)0x00) + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a buffer containing all kernel + * parameters used for launching kernel \p f. This buffer needs to + * honor all alignment/padding requirements of the individual parameters. + * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the + * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no + * effect. + */ +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a size_t which contains the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. + * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified + * in the \p extra array if the value associated with + * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. + */ +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) + +/** + * For texture references loaded into the module, use default texunit from + * texture reference. 
+ */ +#define CU_PARAM_TR_DEFAULT -1 + +/** + * Device that represents the CPU + */ +#define CU_DEVICE_CPU ((CUdevice)-1) + +/** + * Device that represents an invalid device + */ +#define CU_DEVICE_INVALID ((CUdevice)-2) + +/** @} */ /* END CUDA_TYPES */ + +#ifdef _WIN32 +#define CUDAAPI __stdcall +#else +#define CUDAAPI +#endif + +/** + * \defgroup CUDA_ERROR Error Handling + * + * ___MANBRIEF___ error handling functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the error handling functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Gets the string description of an error code + * + * Sets \p *pStr to the address of a NULL-terminated string description + * of the error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorString + */ +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); + +/** + * \brief Gets the string representation of an error code enum name + * + * Sets \p *pStr to the address of a NULL-terminated string representation + * of the name of the enum error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorName + */ +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); + +/** @} */ /* END CUDA_ERROR */ + +/** + * \defgroup CUDA_INITIALIZE Initialization + * + * ___MANBRIEF___ initialization functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the initialization functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Initialize the CUDA driver API + * + * Initializes the driver API and must be called before any other function from + * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() + * has not been called, any function from the driver API will return + * ::CUDA_ERROR_NOT_INITIALIZED. + * + * \param Flags - Initialization flag for CUDA. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, + * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE + * \notefnerr + */ +CUresult CUDAAPI cuInit(unsigned int Flags); + +/** @} */ /* END CUDA_INITIALIZE */ + +/** + * \defgroup CUDA_VERSION Version Management + * + * ___MANBRIEF___ version management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the version management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the latest CUDA version supported by driver + * + * Returns in \p *driverVersion the version of CUDA supported by + * the driver. The version is returned as + * (1000 × major + 10 × minor). For example, CUDA 9.2 + * would be represented by 9020. 
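+ *
+ * A minimal sketch of decoding that value (assuming the call succeeds):
+ * \code
+   int v = 0;
+   if (cuDriverGetVersion(&v) == CUDA_SUCCESS) {
+       int major = v / 1000;          /* 9020 -> 9 */
+       int minor = (v % 1000) / 10;   /* 9020 -> 2 */
+   }
+ * \endcode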
+ * + * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if + * \p driverVersion is NULL. + * + * \param driverVersion - Returns the CUDA driver version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cudaDriverGetVersion, + * ::cudaRuntimeGetVersion + */ +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); + +/** @} */ /* END CUDA_VERSION */ + +/** + * \defgroup CUDA_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given an ordinal in the range [0, + * ::cuDeviceGetCount()-1]. + * + * \param device - Returned device handle + * \param ordinal - Device number to get handle for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); + +/** + * \brief Returns the number of compute-capable devices + * + * Returns in \p *count the number of devices with compute capability greater + * than or equal to 2.0 that are available for execution. If there is no such + * device, ::cuDeviceGetCount() returns 0. + * + * \param count - Returned number of compute-capable devices + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceCount + */ +CUresult CUDAAPI cuDeviceGetCount(int *count); + +/** + * \brief Returns an identifier string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p name. \p len specifies the maximum length of the + * string that may be returned. + * + * \param name - Returned identifier string for the device + * \param len - Maximum length of string to store in \p name + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGetCount, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); + +#if __CUDA_API_VERSION >= 9020 +/** + * \brief Return an UUID for the device + * + * Returns 16-octets identifing the device \p dev in the structure + * pointed by the \p uuid. 
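+ *
+ * A short sketch of printing the identifier (assuming ::CUuuid exposes a
+ * 16-byte \p bytes member, as defined earlier in this header, and <stdio.h>):
+ * \code
+   CUuuid id;
+   if (cuDeviceGetUuid(&id, dev) == CUDA_SUCCESS) {
+       for (int i = 0; i < 16; ++i)
+           printf("%02x", (unsigned char)id.bytes[i]);
+   }
+ * \endcode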
+ * + * \param uuid - Returned UUID + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); +#endif + +#if defined(_WIN32) && __CUDA_API_VERSION >= 10000 +/** + * \brief Return an LUID and device node mask for the device + * + * Return identifying information (\p luid and \p deviceNodeMask) to allow + * matching device with graphics APIs. + * + * \param luid - Returned LUID + * \param deviceNodeMask - Returned device node mask + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); +#endif + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Returns the total amount of memory on the device + * + * Returns in \p *bytes the total amount of memory available on the device + * \p dev in bytes. + * + * \param bytes - Returned memory available on device in bytes + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Returns information about the device + * + * Returns in \p *pi the integer value of the attribute \p attrib on device + * \p dev. 
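+ *
+ * A minimal query sketch (assuming \p dev is a valid device handle), e.g. for
+ * the multiprocessor count and the per-block thread limit:
+ * \code
+   int smCount = 0, maxThreads = 0;
+   cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+   cuDeviceGetAttribute(&maxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+ * \endcode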
The supported attributes are: + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per + * block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of + * shared memory available to a thread block in bytes; + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes; + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the + * memory copy functions that involve memory regions allocated through + * ::cuMemAllocPitch(); + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width + * for a 1D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum + * mipmapped 1D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch + * in bytes for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum + * mipmapped 2D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum + * mipmapped 2D texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D + * texture depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: + * Alternate maximum 3D texture width, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: + * Alternate maximum 3D texture height, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: + * Alternate maximum 3D texture depth, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: + * Maximum cubemap texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: + * Maximum 1D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: + * Maximum 2D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: + * Maximum 2D layered texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: + 
* Maximum cubemap layered texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: + * Maximum 1D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: + * Maximum 2D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: + * Maximum 2D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: + * Maximum 3D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: + * Maximum 3D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: + * Maximum 3D surface depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: + * Maximum 1D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: + * Maximum 2D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: + * Maximum 2D layered surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: + * Maximum cubemap surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit + * registers available to a thread block; + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture + * base addresses aligned to ::textureAlign bytes do not need an offset + * applied to texture fetches; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement + * for 2D texture references bound to pitched memory; + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy + * memory between host and device while executing a kernel, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on + * the device; + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit + * for kernels executed on the device, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the + * memory subsystem, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host + * memory into the CUDA address space, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently + * in. Available modes are as follows: + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and + * can have multiple CUDA contexts present at a single time. + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is + * prohibited from creating new CUDA contexts. + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device + * can have only one context used by a single process at a time. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports + * executing multiple kernels within the same context simultaneously, or 0 if + * not. 
It is not guaranteed that multiple kernels will be resident + * on the device concurrently so this feature should not be relied upon for + * correctness; + * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the + * device, 0 if error correction is disabled or not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier + * of the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device + * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC + * is only available on Tesla hardware running Windows Vista or later; + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz; + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits; + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache; + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with + * the host, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number; + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals + * in L1 cache, 0 if caching globals in L1 cache is not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals + * in L1 cache, 0 if caching locals in L1 cache is not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of + * shared memory available to a multiprocessor in bytes; this amount is shared + * by all thread blocks simultaneously resident on a multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit + * registers available to a multiprocessor; this number is shared by all thread + * blocks simultaneously resident on a multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory + * on this system, 0 if allocating managed memory is not supported by the device on this system. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices + * associated with the same board. Devices on the same multi-GPU board will share the same identifier. + * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host + * supports native atomic operations. + * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance + * (in floating-point operations per second) to double precision performance. + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing + * pageable memory without calling cudaHostRegister on it. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory + * concurrently with the CPU. + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. 
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered + * memory at the same virtual address as the CPU. + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size + * supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. + * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's + * page tables. + * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. + * + * \param pi - Returned device attribute value + * \param attrib - Device attribute to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaDeviceGetAttribute, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); + +/** @} */ /* END CUDA_DEVICE */ + +/** + * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated device management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns properties for a selected device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). + * + * Returns in \p *prop the properties of device \p dev. The ::CUdevprop + * structure is defined as: + * + * \code + typedef struct CUdevprop_st { + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int sharedMemPerBlock; + int totalConstantMemory; + int SIMDWidth; + int memPitch; + int regsPerBlock; + int clockRate; + int textureAlign + } CUdevprop; + * \endcode + * where: + * + * - ::maxThreadsPerBlock is the maximum number of threads per block; + * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; + * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; + * - ::sharedMemPerBlock is the total amount of shared memory available per + * block in bytes; + * - ::totalConstantMemory is the total amount of constant memory available on + * the device in bytes; + * - ::SIMDWidth is the warp size; + * - ::memPitch is the maximum pitch allowed by the memory copy functions that + * involve memory regions allocated through ::cuMemAllocPitch(); + * - ::regsPerBlock is the total number of registers available per block; + * - ::clockRate is the clock frequency in kilohertz; + * - ::textureAlign is the alignment requirement; texture base addresses that + * are aligned to ::textureAlign bytes do not need an offset applied to + * texture fetches. 
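+ *
+ * As an illustrative sketch (editorial, not part of the official documentation),
+ * the equivalent query through the non-deprecated ::cuDeviceGetAttribute() above
+ * might look as follows, assuming ::cuInit() and ::cuDeviceGet() declared
+ * earlier in this header:
+ * \code
+ int maxThreadsPerBlock = 0;
+ CUdevice dev;
+ cuInit(0);                /* initialize the driver API once per process */
+ cuDeviceGet(&dev, 0);     /* handle for the first CUDA device */
+ cuDeviceGetAttribute(&maxThreadsPerBlock,
+                      CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+ * \endcode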
+ * + * \param prop - Returned properties of device + * \param dev - Device to get properties for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); + +/** + * \brief Returns the compute capability of the device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and its functionality superseded + * by ::cuDeviceGetAttribute(). + * + * Returns in \p *major and \p *minor the major and minor revision numbers that + * define the compute capability of the device \p dev. + * + * \param major - Major revision number + * \param minor - Minor revision number + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); + +/** @} */ /* END CUDA_DEVICE_DEPRECATED */ + +/** + * \defgroup CUDA_PRIMARY_CTX Primary Context Management + * + * ___MANBRIEF___ primary context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the primary context management functions of the low-level + * CUDA driver application programming interface. + * + * The primary context is unique per device and shared with the CUDA runtime API. + * These functions allow integration with other libraries using CUDA. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 7000 + +/** + * \brief Retain the primary context on the GPU + * + * Retains the primary context on the device, creating it if necessary, + * increasing its usage count. The caller must call + * ::cuDevicePrimaryCtxRelease() when done using the context. + * Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack. + * + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode + * of the device. + * The nvidia-smi tool can be used to set the compute mode for + * devices. Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * Please note that the primary context always supports pinned allocations. Other + * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). 
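+ *
+ * A minimal retain/release sketch (illustrative only; \p dev is assumed to hold
+ * a valid ::CUdevice handle):
+ * \code
+ CUcontext primary;
+ if (cuDevicePrimaryCtxRetain(&primary, dev) == CUDA_SUCCESS) {
+   cuCtxSetCurrent(primary);         /* make it current; nothing is pushed */
+   /* ... issue work on this device ... */
+   cuDevicePrimaryCtxRelease(dev);   /* balance the retain when finished */
+ }
+ * \endcode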
+ * + * \param pctx - Returned context handle of the new context + * \param dev - Device for which primary context is requested + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRelease, + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); + +/** + * \brief Release the primary context on the GPU + * + * Releases the primary context interop on the device by decreasing the usage + * count by 1. If the usage drops to 0 the primary context of device \p dev + * will be destroyed regardless of how many threads it is current to. + * + * Please note that unlike ::cuCtxDestroy() this method does not pop the context + * from stack in any circumstances. + * + * \param dev - Device which primary context is released + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); + +/** + * \brief Set flags for the primary context + * + * Sets the flags for the primary context on the device overwriting previously + * set ones. If the primary context is already created + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned. + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage. + * + * \param dev - Device for which the primary context flags are set + * \param flags - New flags for the device + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxGetState, + * ::cuCtxCreate, + * ::cuCtxGetFlags, + * ::cudaSetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); + +/** + * \brief Get the state of the primary context + * + * Returns in \p *flags the flags for the primary context of \p dev, and in + * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag + * values. + * + * \param dev - Device to get primary context flags for + * \param flags - Pointer to store flags + * \param active - Pointer to store context state; 0 = inactive, 1 = active + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxGetFlags, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); + +/** + * \brief Destroy all allocations and reset all state on the primary context + * + * Explicitly destroys and cleans up all resources associated with the current + * device in the current process. + * + * Note that it is responsibility of the calling function to ensure that no + * other module in the process is using the device any more. For that reason + * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. + * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() + * even after resetting the device. 
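+ *
+ * A brief sketch (illustrative only; \p dev is assumed) combining the flag and
+ * state queries described above:
+ * \code
+ unsigned int flags;
+ int active;
+ cuDevicePrimaryCtxSetFlags(dev, CU_CTX_SCHED_BLOCKING_SYNC); /* before the primary context is created */
+ cuDevicePrimaryCtxGetState(dev, &flags, &active);            /* active stays 0 until the context is retained */
+ * \endcode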
+ * + * \param dev - Device for which primary context is destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxRelease, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceReset + */ +CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); + +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** @} */ /* END CUDA_PRIMARY_CTX */ + + +/** + * \defgroup CUDA_CTX Context Management + * + * ___MANBRIEF___ context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the context management functions of the low-level + * CUDA driver application programming interface. + * + * Please note that some functions are described in + * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Create a CUDA context + * + * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. + * + * Creates a new CUDA context and associates it with the calling thread. The + * \p flags parameter is described below. The context is created with a usage + * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() + * when done using the context. If a context is already current to the thread, + * it is supplanted by the newly created context and may be restored by a subsequent + * call to ::cuCtxPopCurrent(). + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + * This flag must be set in order to allocate pinned host memory that is + * accessible to the GPU. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage. + * + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the + * compute mode of the device. The nvidia-smi tool can be used to set + * the compute mode for * devices. + * Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * \param pctx - Returned context handle of the new context + * \param flags - Context creation flags + * \param dev - Device to create context on + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroy a CUDA context + * + * Destroys the CUDA context specified by \p ctx. The context \p ctx will be + * destroyed regardless of how many threads it is current to. + * It is the responsibility of the calling function to ensure that no API + * call issues using \p ctx while ::cuCtxDestroy() is executing. + * + * If \p ctx is current to the calling thread then \p ctx will also be + * popped from the current thread's context stack (as though ::cuCtxPopCurrent() + * were called). If \p ctx is current to other threads, then \p ctx will + * remain current to those threads, and attempting to access \p ctx from + * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. 
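+ *
+ * A minimal lifecycle sketch (editorial illustration; \p dev is assumed to hold
+ * a valid device handle):
+ * \code
+ CUcontext ctx;
+ if (cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev) == CUDA_SUCCESS) {
+   /* ... the new context is current on this thread ... */
+   cuCtxDestroy(ctx);  /* every cuCtxCreate() is balanced by a cuCtxDestroy() */
+ }
+ * \endcode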
+ * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Pushes a context on the current CPU thread + * + * Pushes the given context \p ctx onto the CPU thread's stack of current + * contexts. The specified context becomes the CPU thread's current context, so + * all CUDA functions that operate on the current context are affected. + * + * The previous current context may be made current again by calling + * ::cuCtxDestroy() or ::cuCtxPopCurrent(). + * + * \param ctx - Context to push + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + +/** + * \brief Pops the current CUDA context from the current CPU thread. + * + * Pops the current CUDA context from the CPU thread and passes back the + * old context handle in \p *pctx. That context may then be made current + * to a different CPU thread by calling ::cuCtxPushCurrent(). + * + * If a context was current to the CPU thread before ::cuCtxCreate() or + * ::cuCtxPushCurrent() was called, this function makes that context current to + * the CPU thread again. + * + * \param pctx - Returned new context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + +/** + * \brief Binds the specified CUDA context to the calling CPU thread + * + * Binds the specified CUDA context to the calling CPU thread. + * If \p ctx is NULL then the CUDA context previously bound to the + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. + * + * If there exists a CUDA context stack on the calling CPU thread, this + * will replace the top of that stack with \p ctx. + * If \p ctx is NULL then this will be equivalent to popping the top + * of the calling CPU thread's CUDA context stack (or a no-op if the + * calling CPU thread's CUDA context stack is empty). 
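+ *
+ * A push/pop sketch (illustrative only; \p ctx is assumed to be a context
+ * created elsewhere):
+ * \code
+ CUcontext popped;
+ cuCtxPushCurrent(ctx);      /* ctx becomes current on this thread */
+ /* ... issue work against ctx ... */
+ cuCtxPopCurrent(&popped);   /* previous context is restored; popped == ctx */
+ * \endcode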
+ * + * \param ctx - Context to bind to the calling CPU thread + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa + * ::cuCtxGetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaSetDevice + */ +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); + +/** + * \brief Returns the CUDA context bound to the calling CPU thread. + * + * Returns in \p *pctx the CUDA context bound to the calling CPU thread. + * If no context is bound to the calling CPU thread then \p *pctx is + * set to NULL and ::CUDA_SUCCESS is returned. + * + * \param pctx - Returned context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * \notefnerr + * + * \sa + * ::cuCtxSetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Returns the device ID for the current context + * + * Returns in \p *device the ordinal of the current context's device. + * + * \param device - Returned device ID for the current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); + +#if __CUDA_API_VERSION >= 7000 +/** + * \brief Returns the flags for the current context + * + * Returns in \p *flags the flags of the current context. See ::cuCtxCreate + * for flag values. + * + * \param flags - Pointer to store flags of current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetCurrent, + * ::cuCtxGetDevice + * ::cuCtxGetLimit, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxGetStreamPriorityRange, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** + * \brief Block for a context's tasks to complete + * + * Blocks until the device has completed all preceding requested tasks. + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the + * CPU thread will block until the GPU context has finished its work. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cudaDeviceSynchronize + */ +CUresult CUDAAPI cuCtxSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the context. 
The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly + * what the limit has been set to. + * + * Setting each ::CUlimit has its own specific restrictions, so each is + * discussed here. + * + * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. + * Note that the CUDA driver will set the \p limit to the maximum of \p value + * and what the kernel function requires. + * + * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used + * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE + * must be performed before launching any kernel that uses the ::printf() + * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used + * by the ::malloc() and ::free() device system calls. Setting + * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel + * that uses the ::malloc() or ::free() device system calls, otherwise + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of + * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting + * this limit must be performed before any launch of a kernel that uses the + * device runtime and calls ::cudaDeviceSynchronize() above the default sync + * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail + * with error code ::cudaErrorSyncDepthExceeded if the limitation is + * violated. This limit can be set smaller than the default or up the maximum + * launch depth of 24. When setting this limit, keep in mind that additional + * levels of sync depth require the driver to reserve large amounts of device + * memory which can no longer be used for user allocations. If these + * reservations of device memory fail, ::cuCtxSetLimit will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of + * outstanding device runtime launches that can be made from the current + * context. A grid is outstanding from the point of launch up until the grid + * is known to have been completed. Device runtime launches which violate + * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when + * ::cudaGetLastError() is called after launch. If more pending launches than + * the default (2048 launches) are needed for a module using the device + * runtime, this limit can be increased. Keep in mind that being able to + * sustain additional pending launches will require the driver to reserve + * larger amounts of device memory upfront which can no longer be used for + * allocations. If these reservations fail, ::cuCtxSetLimit will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. 
Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. + * Values can range from 0B to 128B. This is purely a performance hint and + * it can be ignored or clamped depending on the platform. + * + * \param limit - Limit to set + * \param value - Size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSynchronize, + * ::cudaDeviceSetLimit + */ +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * Returns in \p *pvalue the current size of \p limit. The supported + * ::CUlimit values are: + * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the + * ::printf() device system call. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the + * ::malloc() and ::free() device system calls. + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread + * can issue the device runtime call ::cudaDeviceSynchronize() to wait on + * child grid launches to complete. + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding + * device runtime launches that can be made from this context. + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. + * + * \param limit - Limit to query + * \param pvalue - Returned size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetLimit + */ +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); + +/** + * \brief Returns the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this function returns through \p pconfig the preferred cache configuration + * for the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute functions. + * + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices + * where the size of the L1 cache and shared memory are fixed. 
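+ *
+ * Before the cache-configuration values below, a brief sketch (editorial, not
+ * part of the official documentation) of the limit functions above:
+ * \code
+ size_t stackSize = 0;
+ cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 16 * 1024);   /* request a 16 KiB per-thread stack */
+ cuCtxGetLimit(&stackSize, CU_LIMIT_STACK_SIZE);  /* read back the value actually set */
+ * \endcode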
+ * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param pconfig - Returned cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetCacheConfig + */ +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); + +/** + * \brief Sets the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute the function. Any function preference + * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide + * setting. Setting the context-wide cache configuration to + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer + * to not change the cache configuration unless required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetCacheConfig + */ +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); + +#if __CUDA_API_VERSION >= 4020 +/** + * \brief Returns the current shared memory configuration for the current context. + * + * This function will return in \p pConfig the current size of shared memory banks + * in the current context. On devices with configurable shared memory banks, + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. 
When + * ::cuCtxGetSharedMemConfig is called on devices without configurable shared + * memory, it will return the fixed bank size of the hardware. + * + * The returned bank configurations can be either: + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is + * four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will + * eight bytes. + * + * \param pConfig - returned shared memory configuration + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetSharedMemConfig + */ +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); + +/** + * \brief Sets the shared memory configuration for the current context. + * + * On devices with configurable shared memory banks, this function will set + * the context's shared memory bank size which is used for subsequent kernel + * launches. + * + * Changed the shared memory configuration between launches may insert a device + * side synchronization point between those launches. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial + * setting (currently, four bytes). + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes. + * + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetSharedMemConfig + */ +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); +#endif + +/** + * \brief Gets the context's API version. + * + * Returns a version number in \p version corresponding to the capabilities of + * the context (e.g. 3010 or 3020), which library developers can use to direct + * callers to a specific API version. If \p ctx is NULL, returns the API version + * used to create the currently bound context. + * + * Note that new API versions are only introduced when context capabilities are + * changed that break binary compatibility, so the API version and driver version + * may be different. 
For example, it is valid for the API version to be 3020 while + * the driver version is 4020. + * + * \param ctx - Context to check + * \param version - Pointer to version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); + +/** + * \brief Returns numerical values that correspond to the least and + * greatest stream priorities. + * + * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond + * to the least and greatest stream priorities respectively. Stream priorities + * follow a convention where lower numbers imply greater priorities. The range of + * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. + * If the user attempts to create a stream with a priority value that is + * outside the meaningful range as specified by this API, the priority is + * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority + * respectively. See ::cuStreamCreateWithPriority for details on creating a + * priority stream. + * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value + * is not desired. + * + * This function will return '0' in both \p *leastPriority and \p *greatestPriority if + * the current context's device does not support stream priorities + * (see ::cuDeviceGetAttribute). + * + * \param leastPriority - Pointer to an int in which the numerical value for least + * stream priority is returned + * \param greatestPriority - Pointer to an int in which the numerical value for greatest + * stream priority is returned + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetStreamPriorityRange + */ +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); + +/** @} */ /* END CUDA_CTX */ + +/** + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated context management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated context management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Increment a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Increments the usage count of the context and passes back a context handle + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is + * done with the context. ::cuCtxAttach() fails if there is no context current + * to the thread. + * + * Currently, the \p flags parameter must be 0. 
+ * + * \param pctx - Returned context handle of the current context + * \param flags - Context attach flags (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxDetach, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); + +/** + * \brief Decrement a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Decrements the usage count of the context \p ctx, and destroys the context + * if the usage count goes to 0. The context must be a handle that was passed + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the + * calling thread. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); + +/** @} */ /* END CUDA_CTX_DEPRECATED */ + + +/** + * \defgroup CUDA_MODULE Module Management + * + * ___MANBRIEF___ module management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the module management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Loads a compute module + * + * Takes a filename \p fname and loads the corresponding module \p module into + * the current context. The CUDA driver API does not attempt to lazily + * allocate the resources needed by a module; if the memory for functions and + * data (constant and global) needed by the module cannot be allocated, + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. + * + * \param module - Returned module + * \param fname - Filename of module to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_FILE_NOT_FOUND, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); + +/** + * \brief Load a module's data + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. 
The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. + * + * \param module - Returned module + * \param image - Module data to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); + +/** + * \brief Load a module's data with options + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. Options are passed as + * an array via \p options and any corresponding parameters are passed in + * \p optionValues. The number of total options is supplied via \p numOptions. + * Any outputs will be returned via \p optionValues. + * + * \param module - Returned module + * \param image - Module data to load + * \param numOptions - Number of options + * \param options - Options for JIT + * \param optionValues - Option values for JIT + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Load a module's data + * + * Takes a pointer \p fatCubin and loads the corresponding module \p module + * into the current context. The pointer represents a fat binary object, + * which is a collection of different \e cubin and/or \e PTX files, all + * representing the same device code, but compiled and optimized for different + * architectures. + * + * Prior to CUDA 4.0, there was no documented API for constructing and using + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary + * objects can be constructed by providing the -fatbin option to \b nvcc. + * More information can be found in the \b nvcc document. 
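+ *
+ * A module-loading sketch (illustrative only; the file name "my_kernels.ptx"
+ * and kernel name "my_kernel" are placeholders):
+ * \code
+ CUmodule mod;
+ CUfunction fn;
+ if (cuModuleLoad(&mod, "my_kernels.ptx") == CUDA_SUCCESS) {
+   cuModuleGetFunction(&fn, mod, "my_kernel");
+   /* ... launch fn ... */
+   cuModuleUnload(mod);
+ }
+ * \endcode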
+ * + * \param module - Returned module + * \param fatCubin - Fat binary to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); + +/** + * \brief Unloads a module + * + * Unloads a module \p hmod from the current context. + * + * \param hmod - Module to unload + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary + */ +CUresult CUDAAPI cuModuleUnload(CUmodule hmod); + +/** + * \brief Returns a function handle + * + * Returns in \p *hfunc the handle of the function of name \p name located in + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() + * returns ::CUDA_ERROR_NOT_FOUND. + * + * \param hfunc - Returned function handle + * \param hmod - Module to retrieve function from + * \param name - Name of function to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Returns a global pointer from a module + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the + * global of name \p name located in module \p hmod. If no variable of that name + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both + * parameters \p dptr and \p bytes are optional. If one of them is + * NULL, it is ignored. 
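+ *
+ * A short sketch (illustrative only; \p mod is an already-loaded module and the
+ * symbol name "scalars" is a placeholder):
+ * \code
+ CUdeviceptr dptr;
+ size_t bytes;
+ if (cuModuleGetGlobal(&dptr, &bytes, mod, "scalars") == CUDA_SUCCESS) {
+   /* dptr and bytes now describe the module's global named "scalars" */
+ }
+ * \endcode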
+ * + * \param dptr - Returned global device pointer + * \param bytes - Returned global size in bytes + * \param hmod - Module to retrieve global from + * \param name - Name of global to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSymbolAddress, + * ::cudaGetSymbolSize + */ +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Returns a handle to a texture reference + * + * Returns in \p *pTexRef the handle of the texture reference of name \p name + * in the module \p hmod. If no texture reference of that name exists, + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference + * handle should not be destroyed, since it will be destroyed when the module + * is unloaded. + * + * \param pTexRef - Returned texture reference + * \param hmod - Module to retrieve texture reference from + * \param name - Name of texture reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetSurfRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetTextureReference + */ +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); + +/** + * \brief Returns a handle to a surface reference + * + * Returns in \p *pSurfRef the handle of the surface reference of name \p name + * in the module \p hmod. If no surface reference of that name exists, + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pSurfRef - Returned surface reference + * \param hmod - Module to retrieve surface reference from + * \param name - Name of surface reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSurfaceReference + */ +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 5050 + +/** + * \brief Creates a pending JIT linker invocation. + * + * If the call is successful, the caller owns the returned CUlinkState, which + * should eventually be destroyed with ::cuLinkDestroy. The + * device code machine size (32 or 64 bit) will match the calling application. + * + * Both linker and compiler options may be specified. Compiler options will + * be applied to inputs to this linker action which must be compiled from PTX. + * The options ::CU_JIT_WALL_TIME, + * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + * will accumulate data until the CUlinkState is destroyed. 
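+ *
+ * An end-to-end sketch (editorial illustration; "kernels.ptx" is a placeholder
+ * path) combining the linker entry points in this group:
+ * \code
+ CUlinkState link;
+ CUmodule mod;
+ void *cubin;
+ size_t cubinSize;
+ cuLinkCreate(0, NULL, NULL, &link);
+ cuLinkAddFile(link, CU_JIT_INPUT_PTX, "kernels.ptx", 0, NULL, NULL);
+ cuLinkComplete(link, &cubin, &cubinSize);
+ cuModuleLoadData(&mod, cubin);   /* load before the link state is destroyed */
+ cuLinkDestroy(link);
+ * \endcode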
+ * + * \p optionValues must remain valid for the life of the CUlinkState if output + * options are used. No other references to inputs are maintained after this + * call returns. + * + * \param numOptions Size of options arrays + * \param options Array of linker and compiler options + * \param optionValues Array of option values, each cast to void * + * \param stateOut On success, this will contain a CUlinkState to specify + * and complete this action + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + +/** + * \brief Add an input to a pending linker invocation + * + * Ownership of \p data is retained by the caller. No reference is retained to any + * inputs after this call returns. + * + * This method accepts only compiler options, which are used if the data must + * be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * \param state A pending linker action. + * \param type The type of the input data. + * \param data The input data. PTX must be NULL-terminated. + * \param size The length of the input data. + * \param name An optional name for this input in log messages. + * \param numOptions Size of options. + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). + * \param optionValues Array of option values, each cast to void *. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Add a file input to a pending linker invocation + * + * No reference is retained to any inputs after this call returns. + * + * This method accepts only compiler options, which are used if the input + * must be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * This method is equivalent to invoking ::cuLinkAddData on the contents + * of the file. 
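+ *
+ * For example (illustrative only; \p lState is a previously created
+ * linker state and the file path is hypothetical):
+ * \code
+   cuLinkAddFile(lState, CU_JIT_INPUT_PTX, "kernels.ptx", 0, NULL, NULL);
+ * \endcode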
+ * + * \param state A pending linker action + * \param type The type of the input data + * \param path Path to the input file + * \param numOptions Size of options + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) + * \param optionValues Array of option values, each cast to void * + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_FILE_NOT_FOUND + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Complete a pending linker invocation + * + * Completes the pending linker action and returns the cubin image for the linked + * device code, which can be used with ::cuModuleLoadData. The cubin is owned by + * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. + * This call does not destroy \p state. + * + * \param state A pending linker invocation + * \param cubinOut On success, this will point to the output image + * \param sizeOut Optional parameter to receive the size of the generated image + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkDestroy, + * ::cuModuleLoadData + */ +CUresult CUDAAPI +cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); + +/** + * \brief Destroys state for a JIT linker invocation. + * + * \param state State object for the linker invocation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \sa ::cuLinkCreate + */ +CUresult CUDAAPI +cuLinkDestroy(CUlinkState state); + +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** @} */ /* END CUDA_MODULE */ + + +/** + * \defgroup CUDA_MEM Memory Management + * + * ___MANBRIEF___ memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets free and total memory + * + * Returns in \p *free and \p *total respectively, the free and total amount of + * memory available for allocation by the CUDA context, in bytes. 
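+ *
+ * For example (illustrative only; error handling omitted):
+ * \code
+   size_t freeBytes, totalBytes;
+   cuMemGetInfo(&freeBytes, &totalBytes);
+   // freeBytes and totalBytes now hold the context's free and total memory.
+ * \endcode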
+ * + * \param free - Returned free memory in bytes + * \param total - Returned total memory in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); + +/** + * \brief Allocates device memory + * + * Allocates \p bytesize bytes of linear memory on the device and returns in + * \p *dptr a pointer to the allocated memory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc + */ +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); + +/** + * \brief Allocates pitched device memory + * + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on + * the device and returns in \p *dptr a pointer to the allocated memory. The + * function may pad the allocation to ensure that corresponding pointers in + * any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. \p ElementSizeBytes + * specifies the size of the largest reads and writes that will be performed + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced + * memory transactions are not possible on other data sizes). If + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, + * the kernel will run correctly, but possibly at reduced speed. The pitch + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the + * allocation. 
The intended usage of pitch is as a separate parameter of the + * allocation, used to compute addresses within the 2D array. Given the row + * and column of an array element of type \b T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; + * \endcode + * + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is + * recommended that programmers consider performing pitch allocations using + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing 2D memory copies + * between different regions of device memory (whether linear memory or CUDA + * arrays). + * + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed + * to match or exceed the alignment requirement for texture binding with + * ::cuTexRefSetAddress2D(). + * + * \param dptr - Returned device pointer + * \param pPitch - Returned pitch of allocation in bytes + * \param WidthInBytes - Requested allocation width in bytes + * \param Height - Requested allocation height in rows + * \param ElementSizeBytes - Size of largest reads/writes for range + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocPitch + */ +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); + +/** + * \brief Frees device memory + * + * Frees the memory space pointed to by \p dptr, which must have been returned + * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). 
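+ *
+ * A minimal allocate/free sketch (illustrative only; error handling
+ * omitted):
+ * \code
+   CUdeviceptr dBuf;
+   cuMemAlloc(&dBuf, 1024 * sizeof(float));
+   // ... use dBuf ...
+   cuMemFree(dBuf);
+ * \endcode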
+ * + * \param dptr - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFree + */ +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); + +/** + * \brief Get information on memory allocations + * + * Returns the base address in \p *pbase and size in \p *psize of the + * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input + * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one + * of them is NULL, it is ignored. + * + * \param pbase - Returned base address + * \param psize - Returned size of device memory allocation + * \param dptr - Device pointer to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the virtual memory ranges + * allocated with this function and automatically accelerates calls to + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by + * the device, it can be read or written with much higher bandwidth than + * pageable memory obtained with functions such as ::malloc(). Allocating + * excessive amounts of memory with ::cuMemAllocHost() may degrade system + * performance, since it reduces the amount of memory available to the system + * for paging. As a result, this function is best used sparingly to allocate + * staging areas for data exchange between host and device. 
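+ *
+ * A minimal pinned staging-buffer sketch (illustrative only; error
+ * handling omitted):
+ * \code
+   void *hStaging;
+   cuMemAllocHost(&hStaging, 1 << 20);   // 1 MiB of page-locked memory
+   // ... fill hStaging, then copy with e.g. cuMemcpyHtoD() ...
+   cuMemFreeHost(hStaging);
+ * \endcode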
+ * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * The device pointer that may be used to access this host memory from those + * contexts is always equal to the returned host pointer \p *pp. + * See \ref CUDA_UNIFIED for additional details. + * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocHost + */ +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Frees page-locked host memory + * + * Frees the memory space pointed to by \p p, which must have been returned by + * a previous call to ::cuMemAllocHost(). + * + * \param p - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeHost + */ +CUresult CUDAAPI cuMemFreeHost(void *p); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. 
As a result, this function is + * best used sparingly to allocate staging areas for data exchange between + * host and device. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). + * + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined + * (WC). WC memory can be transferred across the PCI Express bus more + * quickly on some system configurations, but cannot be read efficiently by + * most CPUs. WC memory is a good option for buffers that will be written by + * the CPU and read by the GPU via mapped pinned memory or host->device + * transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in + * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect. + * + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. + * + * The memory allocated by this function must be freed with ::cuMemFreeHost(). + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + * that may be used to access this host memory from those contexts is always equal + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED + * is specified, then the function ::cuMemHostGetDevicePointer() must be used + * to query the device pointer, even if the context supports unified addressing. + * See \ref CUDA_UNIFIED for additional details. 
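+ *
+ * A minimal mapped, pinned allocation sketch (illustrative only; error
+ * handling is omitted and the context is assumed to have been created
+ * with ::CU_CTX_MAP_HOST):
+ * \code
+   void *hBuf;
+   CUdeviceptr dBuf;
+   cuMemHostAlloc(&hBuf, 1 << 20, CU_MEMHOSTALLOC_DEVICEMAP);
+   cuMemHostGetDevicePointer(&dBuf, hBuf, 0);
+   // dBuf may now be passed to device code; hBuf is accessible on the host.
+   cuMemFreeHost(hBuf);
+ * \endcode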
+ * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostAlloc + */ +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Passes back device pointer of mapped pinned memory + * + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned + * host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP + * flag was not specified at the time the memory was allocated, or if the + * function is called on a GPU that does not support mapped pinned memory. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p p and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p p. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only of the two pointers and not both. + * + * \p Flags provides for future releases. For now, it must be set to 0. 
+ * + * \param pdptr - Returned device pointer + * \param p - Host pointer + * \param Flags - Options (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostGetDevicePointer + */ +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Passes back flags that were used for a pinned allocation + * + * Passes back the flags \p pFlags that were specified when allocating + * the pinned host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetFlags() will fail if the pointer does not reside in + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). + * + * \param pFlags - Returned flags word + * \param p - Host pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuMemAllocHost, + * ::cuMemHostAlloc, + * ::cudaHostGetFlags + */ +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p bytesize bytes of managed memory on the device and returns in + * \p *dptr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support + * for managed memory can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If + * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from + * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to + * ::cuStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cuStreamAttachMemAsync to + * a single stream, the default association as specified during ::cuMemAllocManaged + * is restored when that stream is destroyed. 
For __managed__ variables, the + * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cuMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. 
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a + * non-zero value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all contexts created in + * that process on devices that support managed memory have to be peer-to-peer compatible + * with each other. Context creation will fail if a context is created on a device that + * supports managed memory and is not peer-to-peer compatible with any of the other + * managed memory supporting devices on which contexts were previously created, even if + * those contexts have been destroyed. These environment variables are described + * in the CUDA programming guide under the "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, + * ::cudaMallocManaged + */ +CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 6000 */ + +#if __CUDA_API_VERSION >= 4010 + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given a PCI bus ID string. + * + * \param dev - Returned device handle + * + * \param pciBusId - String in one of the following forms: + * [domain]:[bus]:[device].[function] + * [domain]:[bus]:[device] + * [bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetPCIBusId, + * ::cudaDeviceGetByPCIBusId + */ +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); + +/** + * \brief Returns a PCI Bus Id string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p pciBusId. \p len specifies the maximum length of the + * string that may be returned. + * + * \param pciBusId - Returned identifier string for the device in the following format + * [domain]:[bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. 
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator. + * + * \param len - Maximum length of string to store in \p name + * + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetByPCIBusId, + * ::cudaDeviceGetPCIBusId + */ +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); + +/** + * \brief Gets an interprocess handle for a previously allocated event + * + * Takes as input a previously allocated event. This event must have been + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING + * flags set. This opaque handle may be copied into other processes and + * opened with ::cuIpcOpenEventHandle to allow efficient hardware + * synchronization between GPU work in different processes. + * + * After the event has been opened in the importing process, + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and + * ::cuEventQuery may be used in either process. Performing operations + * on the imported event after the exported event has been freed + * with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pHandle - Pointer to a user allocated CUipcEventHandle + * in which to return the opaque event handle + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and + * ::CU_EVENT_DISABLE_TIMING flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetEventHandle + */ +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); + +/** + * \brief Opens an interprocess event handle for use in the current process + * + * Opens an interprocess event handle exported from another process with + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. + * This event must be freed with ::cuEventDestroy. + * + * Performing operations on the imported event after the exported event has + * been freed with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. 
+ * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param phEvent - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcGetEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcOpenEventHandle + */ +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cuMemAlloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cuMemFree and a subsequent call + * to ::cuMemAlloc returns memory with the same device address, + * ::cuIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return + * the handle in. + * \param dptr - Base pointer to previously allocated device memory + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetMemHandle + */ +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); + +/** + * \brief Opens an interprocess memory handle exported from another process + * and returns a device pointer usable in the local process. + * + * Maps memory exported from another process with ::cuIpcGetMemHandle into + * the current device address space. For contexts on different devices + * ::cuIpcOpenMemHandle can attempt to enable peer access between the + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. + * ::cuDeviceCanAccessPeer can determine if a mapping is possible. + * + * ::cuIpcOpenMemHandle can open handles to devices that may not be visible + * in the process calling the API. + * + * Contexts that may open ::CUipcMemHandles are restricted in the following way. + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened + * by one ::CUcontext per ::CUdevice per other process. + * + * Memory returned from ::cuIpcOpenMemHandle must be freed with + * ::cuIpcCloseMemHandle. + * + * Calling ::cuMemFree on an exported memory region before calling + * ::cuIpcCloseMemHandle in the importing context will result in undefined + * behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. 
+ * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pdptr - Returned device pointer + * \param handle - ::CUipcMemHandle to open + * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \note No guarantees are made about the address returned in \p *pdptr. + * In particular, multiple processes may not receive the same address for the same \p handle. + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcCloseMemHandle, + * ::cuCtxEnablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaIpcOpenMemHandle + */ +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); + +/** + * \brief Close memory mapped with ::cuIpcOpenMemHandle + * + * Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation + * in the exporting process as well as imported mappings in other processes + * will be unaffected. + * + * Any resources used to enable peer access will be freed if this is the + * last mapping using them. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle + */ +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); + +#endif /* __CUDA_API_VERSION >= 4010 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Registers an existing host memory range for use by CUDA + * + * Page-locks the memory range specified by \p p and \p bytesize and maps it + * for the device(s) as specified by \p Flags. This memory range also is added + * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed + * directly by the device, it can be read or written with much higher bandwidth + * than pageable memory that has not been registered. Page-locking excessive + * amounts of memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to register staging areas for data exchange between + * host and device. + * + * This function has limited support on Mac OS X. OS 10.7 or higher is required. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). 
+ * + * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some + * I/O memory space, e.g. the PCI Express resource of a 3rd party device. + * + * All of these flags are orthogonal to one another: a developer may page-lock + * memory that is portable or mapped with no restrictions. + * + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in + * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect. + * + * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p ptr and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p ptr. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p ptr, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only of the two pointers and not both. + * + * The memory page-locked by this function must be unregistered with + * ::cuMemHostUnregister(). + * + * \param p - Host pointer to memory to page-lock + * \param bytesize - Size in bytes of the address range to page-lock + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa + * ::cuMemHostUnregister, + * ::cuMemHostGetFlags, + * ::cuMemHostGetDevicePointer, + * ::cudaHostRegister + */ +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); + +/** + * \brief Unregisters a memory range that was registered with cuMemHostRegister. + * + * Unmaps the memory range whose base address is specified by \p p, and makes + * it pageable again. + * + * The base address must be the same one specified to ::cuMemHostRegister(). + * + * \param p - Host pointer to memory to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + * \notefnerr + * + * \sa + * ::cuMemHostRegister, + * ::cudaHostUnregister + */ +CUresult CUDAAPI cuMemHostUnregister(void *p); + +/** + * \brief Copies memory + * + * Copies data between two pointers. 
+ * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. + * + * \param dst - Destination unified virtual address space pointer + * \param src - Source unified virtual address space pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); + +/** + * \brief Copies device memory between two contexts + * + * Copies from device memory in one context to device memory in another + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeer + */ +CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. 
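+ *
+ * A minimal host-to-device copy sketch (illustrative only; error handling
+ * omitted):
+ * \code
+   float hData[256];
+   CUdeviceptr dData;
+   // ... fill hData ...
+   cuMemAlloc(&dData, sizeof(hData));
+   cuMemcpyHtoD(dData, hData, sizeof(hData));
+   // ... launch work, then copy back with cuMemcpyDtoH() if needed ...
+   cuMemFree(dData);
+ * \endcode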
+ * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol + */ +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. 
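+ *
+ * For example (illustrative only; \p nBytes is hypothetical and error
+ * handling is omitted):
+ * \code
+   CUdeviceptr dSrc, dDst;
+   cuMemAlloc(&dSrc, nBytes);
+   cuMemAlloc(&dDst, nBytes);
+   // ... fill dSrc ...
+   cuMemcpyDtoD(dDst, dSrc, nBytes);
+ * \endcode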
+ * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Array + * + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting index of the destination data. + * \p srcDevice specifies the base pointer of the source. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Array to Device + * + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the + * base pointer of the destination and must be naturally aligned with the CUDA + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle + * and the offset in bytes into the array where the copy is to begin. + * \p ByteCount specifies the number of bytes to copy and must be evenly + * divisible by the array element size. 
+ * + * \param dstDevice - Destination device pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyFromArray + */ +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the destination + * data. \p pSrc specifies the base address of the source. \p ByteCount specifies + * the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. 
+ *
+ * \param dstHost - Destination host pointer
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Array
+ *
+ * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ * specify the handles of the destination and source CUDA arrays for the copy,
+ * respectively. \p dstOffset and \p srcOffset specify the destination and
+ * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ * bytes to be copied. The CUDA arrays need not have the same format, but their
+ * elements must be the same size, and \p ByteCount must be evenly divisible by
+ * that element size.
+ *
+ * \param dstArray - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyArrayToArray
+ */
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. 
The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. 
+ * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy3D + */ +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Copies memory between contexts + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. 
+ * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeer + */ +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + +/** + * \brief Copies memory asynchronously + * + * Copies data between two pointers. + * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. + * + * \param dst - Destination unified virtual address space pointer + * \param src - Source unified virtual address space pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies device memory between two contexts asynchronously. + * + * Copies from device memory in one context to device memory in another + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. + * \p ByteCount specifies the number of bytes to copy. 
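+ *
+ * \par
+ * A minimal sketch, assuming dSrc and dDst were allocated in the previously
+ * created contexts srcCtx and dstCtx respectively, and that hStream belongs to
+ * the current context; error handling is omitted and the names are placeholders:
+ * \code
+   size_t bytes = 1 << 20;                                        // 1 MiB
+   cuMemcpyPeerAsync(dDst, dstCtx, dSrc, srcCtx, bytes, hStream); // queued, non-blocking
+   cuStreamSynchronize(hStream);                                  // wait only when the data is needed
+ * \endcode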
+ * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeerAsync + */ +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync + */ +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. 
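+ *
+ * \par
+ * A minimal stream-ordered round trip, assuming a current context; page-locked
+ * host buffers (from ::cuMemAllocHost) are used so the copies can overlap with
+ * other work, and error checks are omitted:
+ * \code
+   float *hIn, *hOut;
+   cuMemAllocHost((void **)&hIn,  1024 * sizeof(float));
+   cuMemAllocHost((void **)&hOut, 1024 * sizeof(float));
+
+   CUdeviceptr dBuf;
+   cuMemAlloc(&dBuf, 1024 * sizeof(float));
+
+   CUstream stream;
+   cuStreamCreate(&stream, 0);                                   // default flags
+
+   cuMemcpyHtoDAsync(dBuf, hIn, 1024 * sizeof(float), stream);   // host -> device
+   // ... kernels working on dBuf could be launched on the same stream here ...
+   cuMemcpyDtoHAsync(hOut, dBuf, 1024 * sizeof(float), stream);  // device -> host
+   cuStreamSynchronize(stream);                                  // hOut is valid after this
+ * \endcode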
+ * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the + * destination data. \p srcHost specifies the base address of the source. + * \p ByteCount specifies the number of bytes to copy. 
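+ *
+ * \par
+ * A brief sketch (current context assumed, no error checks): queueing an upload
+ * into a 1D array on a stream, where arr, pinnedSrc (page-locked host memory),
+ * nBytes and stream stand for objects created earlier:
+ * \code
+   cuMemcpyHtoAAsync(arr, 0, pinnedSrc, nBytes, stream);
+   cuStreamSynchronize(stream);   // pinnedSrc may be reused after this returns
+ * \endcode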
+ * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyToArrayAsync + */ +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstHost - Destination pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyFromArrayAsync + */ +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+ void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+ CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ * the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy.
The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. 
+ * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpy3DAsync + */ +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Copies memory between contexts asynchronously. + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. 
+ * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeerAsync + */ +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. 
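+ *
+ * \par
+ * A small sketch, assuming a current context and no error checking; note that
+ * \p N counts 16-bit elements, not bytes:
+ * \code
+   CUdeviceptr dBuf;
+   cuMemAlloc(&dBuf, 1024);               // 1024 bytes
+   cuMemsetD8(dBuf, 0x00, 1024);          // clear all 1024 bytes
+   cuMemsetD16(dBuf, 0xABCD, 1024 / 2);   // refill the same range as 512 16-bit values
+ * \endcode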
+ * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
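+ *
+ * \par
+ * One possible pairing with ::cuMemAllocPitch (current context assumed, no
+ * error checks); the pitch returned by the allocation is the row stride handed
+ * back to the memset:
+ * \code
+   CUdeviceptr dImg;
+   size_t pitch;
+   cuMemAllocPitch(&dImg, &pitch, 640, 480, 4);   // 640 bytes per row, 480 rows
+   cuMemsetD2D8(dImg, pitch, 0xFF, 640, 480);     // set every byte of the 640x480 region
+ * \endcode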
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. 
The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. 
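+ *
+ * \par
+ * A short sketch, assuming dBuf, hostDst, nElems and stream were set up earlier
+ * and that the context is current; the memset is ordered before the copy on the
+ * same stream:
+ * \code
+   cuMemsetD16Async(dBuf, 0, nElems, stream);
+   cuMemcpyDtoHAsync(hostDst, dBuf, nElems * sizeof(unsigned short), stream);
+   cuStreamSynchronize(stream);   // hostDst now holds the cleared values
+ * \endcode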
+ * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
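+ *
+ * A minimal usage sketch combining this call with ::cuMemAllocPitch() (assuming a context
+ * is already current; error checking is omitted and the dimensions are placeholders):
+ * \code
+      CUdeviceptr d_img;
+      size_t      pitch;
+      size_t      widthBytes = 640, height = 480;              // placeholder dimensions
+      CUstream    stream;
+      cuStreamCreate(&stream, CU_STREAM_DEFAULT);
+      cuMemAllocPitch(&d_img, &pitch, widthBytes, height, 4);  // driver-selected pitch
+      cuMemsetD2D8Async(d_img, pitch, 0, widthBytes, height, stream); // zero each row
+      cuStreamSynchronize(stream);
+      cuMemFree(d_img);
+      cuStreamDestroy(stream);
+ * \endcode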
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. 
\p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Creates a 1D or 2D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
+ * The ::CUDA_ARRAY_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + CUarray_format Format; + unsigned int NumChannels; + } CUDA_ARRAY_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, and \p Height are the width, and height of the CUDA array (in + * elements); the CUDA array is one-dimensional if height is 0, two-dimensional + * otherwise; + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 1; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + * \endcode + * + * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit + * float16's: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.FormatFlags = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + * \endcode + * + * Description for a \p width x \p height CUDA array of 16-bit elements, each + * of which is two 8-bit unsigned chars: + * \code + CUDA_ARRAY_DESCRIPTOR arrayDesc; + desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; + desc.NumChannels = 2; + desc.Width = width; + desc.Height = height; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - Array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocArray + */ +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 1D or 2D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. 
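+ *
+ * A minimal usage sketch (assuming a context is already current; error checking is omitted):
+ * \code
+      CUarray               hArray;
+      CUDA_ARRAY_DESCRIPTOR desc, queried;
+      desc.Format      = CU_AD_FORMAT_FLOAT;
+      desc.NumChannels = 1;
+      desc.Width       = 2048;
+      desc.Height      = 1;
+      cuArrayCreate(&hArray, &desc);
+      cuArrayGetDescriptor(&queried, hArray);   // queried now reports Width == 2048, Format == CU_AD_FORMAT_FLOAT
+      cuArrayDestroy(hArray);
+ * \endcode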
+ * + * \param pArrayDescriptor - Returned array descriptor + * \param hArray - Array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + + +/** + * \brief Destroys a CUDA array + * + * Destroys the CUDA array \p hArray. + * + * \param hArray - Array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeArray + */ +CUresult CUDAAPI cuArrayDestroy(CUarray hArray); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Creates a 3D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D array is allocated if only \p Depth extent is zero. + * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. 
+ * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. + * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array + * to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. + * + * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag + * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH + * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ * Valid extents per CUDA array type, given as {(width range in elements), (height range),
+ * (depth range)}; the first set must always be met, the second applies when
+ * ::CUDA_ARRAY3D_SURFACE_LDST is set:
+ *
+ * - 1D:
+ *     always:                           { (1,TEXTURE1D_WIDTH), 0, 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_WIDTH), 0, 0 }
+ * - 2D:
+ *     always:                           { (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
+ * - 3D:
+ *     always:                           { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ *                                       OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
+ * - 1D Layered:
+ *     always:                           { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
+ * - 2D Layered:
+ *     always:                           { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
+ * - Cubemap:
+ *     always:                           { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
+ * - Cubemap Layered:
+ *     always:                           { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
+ * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 0; + desc.Depth = 0; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + desc.Depth = 0; + * \endcode + * + * Description for a \p width x \p height x \p depth CUDA array of 64-bit, + * 4x16-bit float16's: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.FormatFlags = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + desc.Depth = depth; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - 3D array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc3DArray + */ +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 3D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. + * + * This function may be called on 1D and 2D arrays, in which case the \p Height + * and/or \p Depth members of the descriptor struct will be set to 0. 
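+ *
+ * A minimal usage sketch (assuming a context is already current; error checking is
+ * omitted and the extents are placeholders):
+ * \code
+      CUarray                 hArray;
+      CUDA_ARRAY3D_DESCRIPTOR desc, queried;
+      desc.Width       = 256;
+      desc.Height      = 256;
+      desc.Depth       = 8;                        // number of layers for a layered array
+      desc.Format      = CU_AD_FORMAT_FLOAT;
+      desc.NumChannels = 1;
+      desc.Flags       = CUDA_ARRAY3D_LAYERED;
+      cuArray3DCreate(&hArray, &desc);
+      cuArray3DGetDescriptor(&queried, hArray);    // recovers the extents, format and flags
+      cuArrayDestroy(hArray);
+ * \endcode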
+ * + * \param pArrayDescriptor - Returned 3D array descriptor + * \param hArray - 3D array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 5000 + +/** + * \brief Creates a CUDA mipmapped array + * + * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. + * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. + * + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D mipmapped array is allocated if only \p Depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. 
+ * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of + * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to + * bind a mipmap level of the CUDA mipmapped array to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA mipmapped arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ * Valid extents per CUDA array type, given as {(width range in elements), (height range),
+ * (depth range)}; the first set must always be met, the second applies when
+ * ::CUDA_ARRAY3D_SURFACE_LDST is set:
+ *
+ * - 1D:
+ *     always:                           { (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_WIDTH), 0, 0 }
+ * - 2D:
+ *     always:                           { (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
+ * - 3D:
+ *     always:                           { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ *                                       OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
+ * - 1D Layered:
+ *     always:                           { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
+ * - 2D Layered:
+ *     always:                           { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
+ * - Cubemap:
+ *     always:                           { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
+ * - Cubemap Layered:
+ *     always:                           { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
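+ *
+ * A minimal usage sketch (assuming a context is already current; error checking is
+ * omitted and the extents are placeholders):
+ * \code
+      CUmipmappedArray        hMipmapped;
+      CUarray                 hLevel0;
+      CUDA_ARRAY3D_DESCRIPTOR desc;
+      desc.Width       = 1024;
+      desc.Height      = 1024;
+      desc.Depth       = 0;                              // plain 2D mipmapped array
+      desc.Format      = CU_AD_FORMAT_FLOAT;
+      desc.NumChannels = 4;
+      desc.Flags       = 0;
+      cuMipmappedArrayCreate(&hMipmapped, &desc, 11);    // 11 == 1 + log2(1024) levels
+      cuMipmappedArrayGetLevel(&hLevel0, hMipmapped, 0); // CUarray view of the base level
+      cuMipmappedArrayDestroy(hMipmapped);
+ * \endcode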
+ * + * + * \param pHandle - Returned mipmapped array + * \param pMipmappedArrayDesc - mipmapped array descriptor + * \param numMipmapLevels - Number of mipmap levels + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuMipmappedArrayDestroy, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaMallocMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); + +/** + * \brief Gets a mipmap level of a CUDA mipmapped array + * + * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level + * of the CUDA mipmapped array \p hMipmappedArray. + * + * If \p level is greater than the maximum number of levels in this mipmapped array, + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pLevelArray - Returned mipmap level CUDA array + * \param hMipmappedArray - CUDA mipmapped array + * \param level - Mipmap level + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayDestroy, + * ::cuArrayCreate, + * ::cudaGetMipmappedArrayLevel + */ +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); + +/** + * \brief Destroys a CUDA mipmapped array + * + * Destroys the CUDA mipmapped array \p hMipmappedArray. + * + * \param hMipmappedArray - Mipmapped array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaFreeMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); + +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** @} */ /* END CUDA_MEM */ + +/** + * \defgroup CUDA_UNIFIED Unified Addressing + * + * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the unified addressing functions of the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_UNIFIED_overview Overview + * + * CUDA devices can share a unified address space with the host. + * For these devices there is no distinction between a device + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel + * running on the device (with exceptions enumerated below). + * + * \section CUDA_UNIFIED_support Supported Platforms + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cuDeviceGetAttribute() with the device + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. + * + * Unified addressing is automatically enabled in 64-bit processes + * + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values + * + * It is possible to look up information about the memory which backs a + * pointer value. 
For instance, one may want to know if a pointer points + * to host or device memory. As another example, in the case of device + * memory, one may want to know on which CUDA device the memory + * resides. These properties may be queried using the function + * ::cuPointerGetAttribute() + * + * Since pointers are unique, it is not necessary to specify information + * about the pointers specified to the various copy functions in the + * CUDA API. The function ::cuMemcpy() may be used to perform a copy + * between two pointers, ignoring whether they point to host or device + * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() + * unnecessary for devices supporting unified addressing). For + * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be + * used to specify that the CUDA driver should infer the location of the + * pointer from its value. + * + * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory + * + * All host memory allocated in all contexts using ::cuMemAllocHost() and + * ::cuMemHostAlloc() is always directly accessible from all contexts on + * all devices that support unified addressing. This is the case regardless + * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and + * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. + * + * The pointer value through which allocated host memory may be accessed + * in kernels on all devices that support unified addressing is the same + * as the pointer value through which that memory is accessed on the host, + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device + * pointer for these allocations. + * + * Note that this is not the case for memory allocated using the flag + * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. + * + * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory + * + * Upon enabling direct access from a context that supports unified addressing + * to another peer context that supports unified addressing using + * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible + * by the current context. The device pointer value through + * which any peer memory may be accessed in the current context + * is the same pointer value through which that memory may be + * accessed in the peer context. + * + * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing + * + * Not all memory may be accessed on devices through the same pointer + * value through which they are accessed on the host. These exceptions + * are host memory registered using ::cuMemHostRegister() and host memory + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these + * exceptions, there exists a distinct host and device address for the + * memory. The device address is guaranteed to not overlap any valid host + * pointer range and is guaranteed to have the same value across all + * contexts that support unified addressing. + * + * This device address may be queried using ::cuMemHostGetDevicePointer() + * when a context using unified addressing is current. Either the host + * or the unified device pointer value may be used to refer to this memory + * through ::cuMemcpy() and similar functions using the + * ::CU_MEMORYTYPE_UNIFIED memory type. 
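+ *
+ * A minimal sketch of the lookup described above (assuming a 64-bit process with a
+ * context current on a device that supports unified addressing; includes and error
+ * checking are omitted and the buffer size is a placeholder):
+ * \code
+      CUdeviceptr   d_buf;
+      unsigned char h_buf[4096];
+      unsigned int  memType = 0;
+      cuMemAlloc(&d_buf, sizeof(h_buf));
+      cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, d_buf);
+      // memType is now CU_MEMORYTYPE_DEVICE
+      cuMemcpy(d_buf, (CUdeviceptr)(uintptr_t)h_buf, sizeof(h_buf)); // direction inferred from the pointer values
+      cuMemFree(d_buf);
+ * \endcode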
+ * + */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Returns information about a pointer + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT: + * + * Returns in \p *data the ::CUcontext in which \p ptr was allocated or + * registered. + * The type of \p data must be ::CUcontext *. + * + * If \p ptr was not allocated by, mapped by, or registered with + * a ::CUcontext which uses unified virtual addressing then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: + * + * Returns in \p *data the physical memory type of the memory that + * \p ptr addresses as a ::CUmemorytype enumerated value. + * The type of \p data must be unsigned int. + * + * If \p ptr addresses device memory then \p *data is set to + * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the + * memory resides is the ::CUdevice of the ::CUcontext returned by the + * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. + * + * If \p ptr addresses host memory then \p *data is set to + * ::CU_MEMORYTYPE_HOST. + * + * If \p ptr was not allocated by, mapped by, or registered with + * a ::CUcontext which uses unified virtual addressing then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * If the current ::CUcontext does not support unified virtual + * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: + * + * Returns in \p *data the device pointer value through which + * \p ptr may be accessed by kernels running in the current + * ::CUcontext. + * The type of \p data must be CUdeviceptr *. + * + * If there exists no device pointer value through which + * kernels running in the current ::CUcontext may access + * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * If there is no current ::CUcontext then + * ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input + * value \p ptr. + * + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: + * + * Returns in \p *data the host pointer value through which + * \p ptr may be accessed by by the host program. + * The type of \p data must be void **. + * If there exists no host pointer value through which + * the host program may directly access \p ptr then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input + * value \p ptr. + * + * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: + * + * Returns in \p *data two tokens for use with the nv-p2p.h Linux + * kernel interface. \p data must be a struct of type + * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. + * + * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). + * Note that p2pToken and vaSpaceToken are only valid for the + * lifetime of the source allocation. A subsequent allocation at + * the same address may return completely different tokens. + * Querying this attribute has a side effect of setting the attribute + * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that + * \p ptr points to. + * + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: + * + * A boolean attribute which when set, ensures that synchronous memory operations + * initiated on the region of memory that \p ptr points to will always synchronize. 
+ * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: + * + * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. + * \p data must point to an unsigned long long. + * + * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. + * Every memory allocation from any of the CUDA memory allocation APIs will + * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs + * from previous freed allocations. IDs are only unique within a single process. + * + * + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: + * + * Returns in \p *data a boolean that indicates whether the pointer points to + * managed memory or not. + * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: + * + * Returns in \p *data an integer representing a device ordinal of a device against + * which the memory was allocated or registered. + * + * \par + * + * Note that for most allocations in the unified virtual address space + * the host and device pointer for accessing the allocation will be the + * same. The exceptions to this are + * - user memory registered using ::cuMemHostRegister + * - host memory allocated using ::cuMemHostAlloc with the + * ::CU_MEMHOSTALLOC_WRITECOMBINED flag + * For these types of allocation there will exist separate, disjoint host + * and device addresses for accessing the allocation. In particular + * - The host address will correspond to an invalid unmapped device address + * (which will result in an exception if accessed from the device) + * - The device address will correspond to an invalid unmapped host address + * (which will result in an exception if accessed from the host). + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host + * and device addresses from either address. + * + * \param data - Returned pointer attribute value + * \param attribute - Pointer attribute to query + * \param ptr - Pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerSetAttribute, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 8000 +/** + * \brief Prefetches memory to the specified destination device + * + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the + * destination device. \p count specifies the number of bytes to copy. \p hStream + * is the stream in which the operation is enqueued. The memory range must refer + * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + * + * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If + * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * must be non-zero. 
Additionally, \p hStream must be associated with a device that has a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * + * The start address and end address of the memory range will be rounded down and rounded up + * respectively to be aligned to CPU page size before the prefetch operation is enqueued + * in the stream. + * + * If no physical memory has been allocated for this region, then this memory region + * will be populated and mapped on the destination device. If there's insufficient + * memory to prefetch the desired region, the Unified Memory driver may evict pages from other + * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + * + * By default, any mappings to the previous location of the migrated pages are removed and + * mappings for the new location are only setup on \p dstDevice. The exact behavior however + * also depends on the settings applied to this memory range via ::cuMemAdvise as described + * below: + * + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + * then that subset will create a read-only copy of the pages on \p dstDevice. + * + * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the + * preferred location of any pages in the memory range. + * + * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + * then mappings to those pages from all the appropriate processors are updated to + * refer to the new location if establishing such a mapping is possible. Otherwise, + * those mappings are cleared. + * + * Note that this API is not required for functionality and only serves to improve performance + * by allowing the application to migrate data to a suitable location before it is accessed. + * Memory accesses to this range are always coherent and are allowed even when the data is + * actively being migrated. + * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param devPtr - Pointer to be prefetched + * \param count - Size in bytes + * \param dstDevice - Destination device to prefetch to + * \param hStream - Stream to enqueue prefetch operation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, + * ::cudaMemPrefetchAsync + */ +CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + +/** + * \brief Advise about the usage of a given memory range + * + * Advise the Unified Memory subsystem about the usage pattern for the memory range + * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + * range will be rounded down and rounded up respectively to be aligned to CPU page size before the + * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + * memory provided it represents a valid, host-accessible region of memory and all additional constraints + * imposed by \p advice as outlined below are also satisfied. 
Specifying an invalid system-allocated pageable + * memory range results in an error being returned. + * + * The \p advice parameter can take the following values: + * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + * from and only occasionally written to. Any read accesses from any processor to this region will create a + * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + * is called on this region, it will create a read-only copy of the data on the destination processor. + * If any processor writes to this region, all copies of the corresponding page will be invalidated + * except for the one where the write occurred. The \p device argument is ignored for this advice. + * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * Also, if a context is created on a device that does not have the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + * all such contexts are destroyed. + * If the memory region refers to valid system-allocated pageable memory, then the accessing device must + * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + * will not create a read-only copy when that device accesses this memory region. + * + * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + * copies of the data will be collapsed into a single copy. The location for the collapsed + * copy will be the preferred location if the page has a preferred location and one of the read-duplicated + * copies was resident at that location. Otherwise, the location chosen is arbitrary. + * + * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location + * does not cause data to migrate to that location immediately. Instead, it guides the migration policy + * when a fault occurs on that memory region. If the data is already in its preferred location and the + * faulting processor can establish a mapping without requiring the data to be migrated, then + * data migration will be avoided. On the other hand, if the data is not in its preferred location + * or if a direct mapping cannot be established, then it will be migrated to the processor accessing + * it. It is important to note that setting the preferred location does not prevent data prefetching + * done using ::cuMemPrefetchAsync. + * Having a preferred location can override the page thrash detection and resolution logic in the Unified + * Memory driver. 
Normally, if a page is detected to be constantly thrashing between for example host and device + * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice, unless read accesses from + * \p device will not result in a read-only copy being created on that device as outlined in description for + * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. Note however that this behavior may change in the future. + * + * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + * and changes the preferred location to none. + * + * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then + * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + * This advice does not cause data migration and has no impact on the location of the data per se. Instead, + * it causes the data to always be mapped in the specified processor's page tables, as long as the + * location of the data permits a mapping to be established. If the data gets migrated for any reason, + * the mappings are updated accordingly. + * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + * over to the other GPUs is not as important because the accesses are infrequent and the overhead of + * migration may be too high. But preventing faults can still help improve performance, and so having + * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + * page in host memory. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice. Additionally, if the + * preferred location of this memory region or any subset of it is also \p device, then the policies + * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. 
Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * \param devPtr - Pointer to memory to set the advice for + * \param count - Size in bytes of the memory range + * \param advice - Advice to be applied for the specified memory range + * \param device - Device to apply the advice for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, + * ::cudaMemAdvise + */ +CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); + +/** + * \brief Query an attribute of a given memory range + * + * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. + * + * The \p attribute parameter can take the following values: + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted + * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given + * memory range have read-duplication enabled, or 0 otherwise. + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device + * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU + * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID + * if either all the pages don't have the same preferred location or some of the pages don't have a + * preferred location at all. Note that the actual location of the pages in the memory range at the time of + * the query may be different from the preferred location. + * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted + * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned + * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. + * If any device does not have that advice set for the entire memory range, that device will not be included. + * If \p data is larger than the number of devices that have that advice set for that memory range, + * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 + * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be + * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. 
If \p data is smaller than the number of devices that have + * that advice set, then only as many devices will be returned as can fit in the array. There is no + * guarantee on which specific devices will be returned, however. + * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location + * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be + * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU + * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not + * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the + * last location that the applicaton requested to prefetch the memory range to. It gives no indication as to + * whether the prefetch operation to that location has completed or even begun. + * + * \param data - A pointers to a memory location where the result + * of each attribute query will be written to. + * \param dataSize - Array containing the size of data + * \param attribute - The attribute to query + * \param devPtr - Start of the range to query + * \param count - Size of the range to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, + * ::cuMemAdvise, + * ::cudaMemRangeGetAttribute + */ +CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); + +/** + * \brief Query attributes of a given memory range. + * + * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes + * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. + * The results of the query will be stored in \p data. + * + * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for + * attribute descriptions and restrictions. + * + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION + * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY + * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION + * + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. 
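/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * how the memory-advice and range-attribute APIs documented above are
 * typically combined. Assumes a CUDA context is already current and that the
 * device supports managed memory; error handling elided, N is a placeholder
 * buffer size.
 */
#include <cuda.h>

static void advise_and_query(CUdevice dev, size_t N)
{
    CUdeviceptr buf;
    cuMemAllocManaged(&buf, N, CU_MEM_ATTACH_GLOBAL);

    /* Prefer to keep the pages resident on `dev` and read-duplicate them. */
    cuMemAdvise(buf, N, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
    cuMemAdvise(buf, N, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);

    /* Query the preferred location back; CU_DEVICE_CPU or CU_DEVICE_INVALID
     * are also possible results, as described above. */
    int preferred = 0;
    cuMemRangeGetAttribute(&preferred, sizeof(preferred),
                           CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, buf, N);

    cuMemFree(buf);
}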
+ * \param dataSizes - Array containing the sizes of each result + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param numAttributes - Number of attributes to query + * \param devPtr - Start of the range to query + * \param count - Size of the range to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise + * ::cuMemPrefetchAsync, + * ::cudaMemRangeGetAttributes + */ +CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); +#endif /* __CUDA_API_VERSION >= 8000 */ + +#if __CUDA_API_VERSION >= 6000 +/** + * \brief Set attributes on a previously allocated memory region + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: + * + * A boolean attribute that can either be set (1) or unset (0). When set, + * the region of memory that \p ptr points to is guaranteed to always synchronize + * memory operations that are synchronous. If there are some previously initiated + * synchronous memory operations that are pending when this attribute is set, the + * function does not return until those memory operations are complete. + * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. + * + * \param value - Pointer to memory containing the value to be set + * \param attribute - Pointer attribute to set + * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuPointerGetAttribute, + * ::cuPointerGetAttributes, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister + */ +CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 6000 */ + +#if __CUDA_API_VERSION >= 7000 +/** + * \brief Returns information about a pointer. + * + * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + * + * \param numAttributes - Number of attributes to query + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. 
+ * \param ptr - Pointer to query + * + * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr + * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values + * and CUDA_SUCCESS is returned. + * + * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA + * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerGetAttribute, + * ::cuPointerSetAttribute, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** @} */ /* END CUDA_UNIFIED */ + +/** + * \defgroup CUDA_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Create a stream + * + * Creates a stream and returns a handle in \p phStream. The \p Flags argument + * determines behaviors of the stream. Valid values for \p Flags are: + * - ::CU_STREAM_DEFAULT: Default stream creation flag. + * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. + * + * \param phStream - Returned newly created stream + * \param Flags - Parameters for stream creation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); + +/** + * \brief Create a stream with the given priority + * + * Creates a stream with the specified priority and returns a handle in \p phStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. + * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param phStream - Returned newly created stream + * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of + * valid flags + * \param priority - Stream priority. Lower numbers represent higher priorities. + * See ::cuCtxGetStreamPriorityRange for more information about + * meaningful stream priorities that can be passed. 
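/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * creating a default stream and a high-priority, non-blocking stream with the
 * APIs documented above. Assumes a current context; errors are ignored for
 * brevity.
 */
#include <cuda.h>

static void make_streams(CUstream *normal, CUstream *high_prio)
{
    cuStreamCreate(normal, CU_STREAM_DEFAULT);

    /* Lower numbers mean higher priority; query the meaningful range first. */
    int least = 0, greatest = 0;
    cuCtxGetStreamPriorityRange(&least, &greatest);
    cuStreamCreateWithPriority(high_prio, CU_STREAM_NON_BLOCKING, greatest);
}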
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \note Stream priorities are supported only on GPUs + * with compute capability 3.5 or higher. + * + * \note In the current implementation, only compute kernels launched in + * priority streams are affected by the stream's priority. Stream priorities have + * no effect on host-to-device and device-to-host memory operations. + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreateWithPriority + */ +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); + + +/** + * \brief Query the priority of a given stream + * + * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the priority in \p priority. Note that if the stream was created with a + * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * this function returns the clamped priority. + * See ::cuStreamCreateWithPriority for details about priority clamping. + * + * \param hStream - Handle to the stream to be queried + * \param priority - Pointer to a signed integer in which the stream's priority is returned + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamCreateWithPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cudaStreamGetPriority + */ +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + +/** + * \brief Query the flags of a given stream + * + * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the flags in \p flags. + * + * \param hStream - Handle to the stream to be queried + * \param flags - Pointer to an unsigned integer in which the stream's flags are returned + * The value returned in \p flags is a logical 'OR' of all flags that + * were used while creating this stream. See ::cuStreamCreate for the list + * of valid flags + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cudaStreamGetFlags + */ +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + +#if __CUDA_API_VERSION >= 9020 + +/** + * \brief Query the context associated with a stream + * + * Returns the CUDA context that the stream is associated with. + * + * The stream handle \p hStream can refer to any of the following: + *

+ * - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   The returned context is the context that was active in the calling thread when the
+ *   stream was created. Passing an invalid handle will result in undefined behavior.
+ * - any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   Specifying any of the special handles will return the context current to the
+ *   calling thread. If no context is current to the calling thread,
+ *   ::CUDA_ERROR_INVALID_CONTEXT is returned.
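/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * recovering the context a stream belongs to via ::cuStreamGetCtx, e.g. when a
 * stream handle is handed in by another library. Assumes CUDA >= 9.2 and a
 * valid stream; errors are ignored.
 */
#include <cuda.h>

static CUcontext context_of(CUstream s)
{
    CUcontext ctx = NULL;
    cuStreamGetCtx(s, &ctx);   /* for the NULL/legacy/per-thread streams this
                                  returns the context current to this thread */
    return ctx;
}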
+ * + * \param hStream - Handle to the stream to be queried + * \param pctx - Returned context associated with the stream + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + +#endif /* __CUDA_API_VERSION >= 9020 */ + +/** + * \brief Make a compute stream wait on an event + * + * Makes all future work submitted to \p hStream wait for all work captured in + * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. + * The synchronization will be performed efficiently on the device when applicable. + * \p hEvent may be from a different context or device than \p hStream. + * + * \param hStream - Stream to wait + * \param hEvent - Event to wait on (may not be NULL) + * \param Flags - Parameters for the operation (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuEventRecord, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cuStreamDestroy, + * ::cudaStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + +/** + * \brief Add a callback to a compute stream + * + * \note This function is slated for eventual deprecation and removal. If + * you do not require the callback to execute in case of a device error, + * consider using ::cuLaunchHostFunc. Additionally, this function is not + * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike + * ::cuLaunchHostFunc. + * + * Adds a callback to be called on the host after all currently enqueued + * items in the stream have completed. For each + * cuStreamAddCallback call, the callback will be executed exactly once. + * The callback will block later work in the stream until it is finished. + * + * The callback may be passed ::CUDA_SUCCESS or an error code. In the event + * of a device error, all subsequently executed callbacks will receive an + * appropriate ::CUresult. + * + * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API + * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any + * synchronization that may depend on outstanding device work or other callbacks + * that are not mandated to run earlier. Callbacks without a mandated order + * (in independent streams) execute in undefined order and may be serialized. + * + * For the purposes of Unified Memory, callback execution makes a number of + * guarantees: + *
+ * - The callback stream is considered idle for the duration of the
+ *   callback. Thus, for example, a callback may always use memory attached
+ *   to the callback stream.
+ * - The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback. It thus synchronizes streams which have been "joined"
+ *   prior to the callback.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed. Thus, for example, a callback might use global attached
+ *   memory even if work has been added to another stream, if the work has
+ *   been ordered behind the callback with an event.
+ * - Completion of a callback does not cause a stream to become
+ *   active except as described above. The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between. Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.
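/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * a minimal host callback enqueued with ::cuStreamAddCallback. The callback
 * must not call into the CUDA API; here it only reports the status. Assumes a
 * valid stream; error handling elided.
 */
#include <cuda.h>
#include <stdio.h>

static void CUDA_CB on_done(CUstream hStream, CUresult status, void *userData)
{
    (void)hStream;
    (void)userData;
    printf("preceding stream work finished with CUresult %d\n", (int)status);
}

static void enqueue_notification(CUstream stream)
{
    /* Runs on the host once all previously enqueued work in `stream` is done. */
    cuStreamAddCallback(stream, on_done, NULL, 0);
}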
+ * + * \param hStream - Stream to add callback to + * \param callback - The function to call once preceding stream operations are complete + * \param userData - User specified data to be passed to the callback function + * \param flags - Reserved for future use, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamLaunchHostFunc, + * ::cudaStreamAddCallback + */ +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Begins graph capture on a stream + * + * Begin graph capture on \p hStream. When a stream is in capture mode, all operations + * pushed into the stream will not be executed, but will instead be captured into + * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated + * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which + * it was initiated, and it may only be initiated if the stream is not already in capture + * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id + * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. + * + * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be + * called on this stream from the same thread. + * + * \param hStream - Stream in which to initiate capture + * \param mode - Controls the interaction of this capture sequence with other API + * calls that are potentially unsafe. For more details see + * ::cuThreadExchangeStreamCaptureMode. + * + * \note Kernels captured using this API must not use texture and surface references. + * Reading or writing through any texture or surface reference is undefined + * behavior. This restriction does not apply to texture and surface objects. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamIsCapturing, + * ::cuStreamEndCapture, + * ::cuThreadExchangeStreamCaptureMode + */ +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); + +#endif /* __CUDA_API_VERSION >= 10000 */ +#if __CUDA_API_VERSION >= 10010 + +/** + * \brief Swaps the stream capture interaction mode for a thread + * + * Sets the calling thread's stream capture interaction mode to the value contained + * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To + * facilitate deterministic behavior across function or module boundaries, callers + * are encouraged to use this API in a push-pop fashion: \code + CUstreamCaptureMode mode = desiredMode; + cuThreadExchangeStreamCaptureMode(&mode); + ... + cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode + * \endcode + * + * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call + * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is + * not enqueued asynchronously to a stream, and is not observed by stream capture. 
+ * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture + * depended on the allocation being replayed whenever the graph is launched, the + * captured graph would be invalid. + * + * Therefore, stream capture places restrictions on API calls that can be made within + * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This + * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. + * + * A thread's mode is one of the following: + * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has + * an ongoing capture sequence that was not initiated with + * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread + * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, + * this thread is prohibited from potentially unsafe API calls. + * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture + * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited + * from potentially unsafe API calls. Concurrent capture sequences in other threads + * are ignored. + * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially + * unsafe API calls. Note that the thread is still prohibited from API calls which + * necessarily conflict with stream capture, for example, attempting ::cuEventQuery + * on an event that was last recorded inside a capture sequence. + * + * \param mode - Pointer to mode value to swap with the current mode + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture + */ +CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); + +#endif /* __CUDA_API_VERSION >= 10010 */ +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Ends capture on a stream, returning the captured graph + * + * End capture on \p hStream, returning the captured graph via \p phGraph. + * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. + * If capture was invalidated, due to a violation of the rules of stream capture, then + * a NULL graph will be returned. + * + * If the \p mode argument to ::cuStreamBeginCapture was not + * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as + * ::cuStreamBeginCapture. + * + * \param hStream - Stream to query + * \param phGraph - The captured graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ +CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + +/** + * \brief Returns a stream's capture status + * + * Return the capture status of \p hStream via \p captureStatus. After a successful + * call, \p *captureStatus will contain one of the following: + * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. + * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. + * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error + * has invalidated the capture sequence. The capture sequence must be terminated + * with ::cuStreamEndCapture on the stream where it was initiated in order to + * continue using \p hStream. 
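/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * recording asynchronously enqueued work into a graph with
 * ::cuStreamBeginCapture / ::cuStreamEndCapture (CUDA >= 10.0). Instantiating
 * and launching the captured graph is out of scope here. Assumes a
 * non-default stream and valid device pointers; errors are ignored.
 */
#include <cuda.h>

static CUgraph capture_copy(CUstream stream, CUdeviceptr dst, CUdeviceptr src, size_t bytes)
{
    CUgraph graph = NULL;
    cuStreamBeginCapture(stream, CU_STREAM_CAPTURE_MODE_GLOBAL);

    /* Everything enqueued here is captured into the graph, not executed. */
    cuMemcpyAsync(dst, src, bytes, stream);

    CUstreamCaptureStatus st;
    cuStreamIsCapturing(stream, &st);      /* expected: CU_STREAM_CAPTURE_STATUS_ACTIVE */

    cuStreamEndCapture(stream, &graph);    /* NULL if the capture was invalidated */
    return graph;
}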
+ * + * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while + * a blocking stream in the same context is capturing, it will return + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified + * after the call. The blocking stream capture is not invalidated. + * + * When a blocking stream is capturing, the legacy stream is in an + * unusable state until the blocking stream capture is terminated. The legacy + * stream is not supported for stream capture, but attempted use would have an + * implicit dependency on the capturing stream(s). + * + * \param hStream - Stream to query + * \param captureStatus - Returns the stream's capture status + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamEndCapture + */ +CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + +#endif /* __CUDA_API_VERSION >= 10000 */ + +#if __CUDA_API_VERSION >= 10010 + +/** + * \brief Query capture status of a stream + * + * Query the capture status of a stream and and get an id for + * the capture sequence, which is unique over the lifetime of the process. + * + * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created + * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. + * + * A valid id is returned only if both of the following are true: + * - the call returns CUDA_SUCCESS + * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ + CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus, cuuint64_t *id); + +#endif /* __CUDA_API_VERSION >= 10010 */ + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p hStream to specify stream association of + * \p length bytes of memory starting from \p dptr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p dptr must point to one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cuMemAllocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable host allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::CUmemAttach_flags. + * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed + * by any stream on any device. 
+ * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with + * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + * the program makes a guarantee that it will only access the memory on the device + * from \p hStream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p hStream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p hStream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cuMemAllocManaged. For __managed__ variables, the default + * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. + * + * \param hStream - Stream in which to enqueue the attach operation + * \param dptr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * pageable memory) + * \param length - Length of memory + * \param flags - Must be one of ::CUmemAttach_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cudaStreamAttachMemAsync + */ +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 6000 */ + +/** + * \brief Determine status of a compute stream + * + * Returns ::CUDA_SUCCESS if all operations in the stream specified by + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuStreamSynchronize(). 
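/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * associating a managed allocation with a single stream via
 * ::cuStreamAttachMemAsync so the CPU may touch it as soon as that stream is
 * idle, which matters most on devices without concurrent managed access.
 * Error handling elided.
 */
#include <cuda.h>

static void attach_to_stream(CUstream stream, size_t bytes)
{
    CUdeviceptr managed;
    cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL);

    /* length 0 means "the entire allocation" for managed memory. */
    cuStreamAttachMemAsync(stream, managed, 0, CU_MEM_ATTACH_SINGLE);

    /* ... enqueue kernels that use `managed` into `stream` ... */

    cuStreamSynchronize(stream);   /* after this the host may access `managed` */
    cuMemFree(managed);
}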
+ * + * \param hStream - Stream to query status of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamQuery + */ +CUresult CUDAAPI cuStreamQuery(CUstream hStream); + +/** + * \brief Wait until a stream's tasks are completed + * + * Waits until the device has completed all operations in the stream specified + * by \p hStream. If the context was created with the + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the + * stream is finished with all of its tasks. + * + * \param hStream - Stream to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamDestroy, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamAddCallback, + * ::cudaStreamSynchronize + */ +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys a stream + * + * Destroys the stream specified by \p hStream. + * + * In case the device is still doing work in the stream \p hStream + * when ::cuStreamDestroy() is called, the function will return immediately + * and the resources associated with \p hStream will be released automatically + * once the device has completed all work in \p hStream. + * + * \param hStream - Stream to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamDestroy + */ +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** @} */ /* END CUDA_STREAM */ + + +/** + * \defgroup CUDA_EVENT Event Management + * + * ___MANBRIEF___ event management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the event management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates an event + * + * Creates an event *phEvent for the current context with the flags specified via + * \p Flags. Valid flags include: + * - ::CU_EVENT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on + * an event created with this flag will block until the event has actually + * been recorded. + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). + * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an + * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must + * be specified along with ::CU_EVENT_DISABLE_TIMING. 
+ * + * \param phEvent - Returns newly created event + * \param Flags - Event creation flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventCreate, + * ::cudaEventCreateWithFlags + */ +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); + +/** + * \brief Records an event + * + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. + * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. + * + * ::cuEventRecord() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. + * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventRecord + */ +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + +/** + * \brief Queries an event's status + * + * Queries the status of all work currently captured by \p hEvent. See + * ::cuEventRecord() for details on what is captured by an event. + * + * Returns ::CUDA_SUCCESS if all captured work has been completed, or + * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuEventSynchronize(). + * + * \param hEvent - Event to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventQuery + */ +CUresult CUDAAPI cuEventQuery(CUevent hEvent); + +/** + * \brief Waits for an event to complete + * + * Waits until the completion of all work currently captured in \p hEvent. + * See ::cuEventRecord() for details on what is captured by an event. + * + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. 
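/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * timing a region of stream work with the event APIs documented above and
 * ::cuEventElapsedTime (documented just below). Assumes a valid stream and
 * that the caller enqueues the work to be timed between the two records;
 * errors are ignored.
 */
#include <cuda.h>

static float time_region_ms(CUstream stream)
{
    CUevent start, stop;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);

    cuEventRecord(start, stream);
    /* ... enqueue the kernels / copies to be timed into `stream` ... */
    cuEventRecord(stop, stream);

    cuEventSynchronize(stop);          /* block until `stop` has completed */

    float ms = 0.0f;
    cuEventElapsedTime(&ms, start, stop);

    cuEventDestroy(start);
    cuEventDestroy(stop);
    return ms;
}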
+ * + * \param hEvent - Event to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventSynchronize + */ +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys an event + * + * Destroys the event specified by \p hEvent. + * + * An event may be destroyed before it is complete (i.e., while + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the + * call does not block on completion of the event, and any associated + * resources will automatically be released asynchronously at completion. + * + * \param hEvent - Event to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventElapsedTime, + * ::cudaEventDestroy + */ +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Computes the elapsed time between two events + * + * Computes the elapsed time between two events (in milliseconds with a + * resolution of around 0.5 microseconds). + * + * If either event was last recorded in a non-NULL stream, the resulting time + * may be greater than expected (even if both used the same stream handle). This + * happens because the ::cuEventRecord() operation takes place asynchronously + * and there is no guarantee that the measured latency is actually just between + * the two events. Any number of other different stream operations could execute + * in between the two measured events, thus altering the timing in a significant + * way. + * + * If ::cuEventRecord() has not been called on either event then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + * on both events but one or both of them has not yet been completed (that is, + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms + * \param hStart - Starting event + * \param hEnd - Ending event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cudaEventElapsedTime + */ +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + +/** @} */ /* END CUDA_EVENT */ + +/** + * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability + * + * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the external resource interoperability functions of the low-level CUDA + * driver application programming interface. 
+ * + * @{ + */ + +#if __CUDA_API_VERSION >= 10000 + + /** + * \brief Imports an external memory object + * + * Imports an externally allocated memory object and returns + * a handle to that in \p extMem_out. + * + * The properties of the handle being imported must be described in + * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure + * is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + CUexternalMemoryHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + } handle; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type + * of handle being imported. ::CUexternalMemoryHandleType is + * defined as: + * + * \code + typedef enum CUexternalMemoryHandleType_enum { + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 + } CUexternalMemoryHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a memory object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a memory object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a memory object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * memory object are destroyed. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Heap object. This handle holds a reference to the underlying + * object. 
If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Heap object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Resource object. This handle holds a reference to the + * underlying object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Resource object. + * + * The size of the memory object must be specified in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. + * + * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the + * resource is a dedicated resource. The definition of what a + * dedicated resource is outside the scope of this extension. + * + * \param extMem_out - Returned handle to an external memory object + * \param memHandleDesc - Memory import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the + * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges + * as well as appropriate Vulkan pipeline barriers to maintain coherence between + * CPU and GPU. For more information on these APIs, please refer to "Synchronization + * and Cache Control" chapter from Vulkan specification. + * + * \sa ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); + +/** + * \brief Maps a buffer onto an imported memory object + * + * Maps a buffer onto an imported memory object and returns a device + * pointer in \p devPtr. + * + * The properties of the buffer being mapped must be described in + * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + unsigned long long offset; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in + * the memory object where the buffer's base address is. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. + * + * The offset and size have to be suitably aligned to match the + * requirements of the external API. Mapping two buffers whose ranges + * overlap may or may not result in the same virtual address being + * returned for the overlapped portion. In such cases, the application + * must ensure that all accesses to that region from the GPU are + * volatile. Otherwise writes made via one address are not guaranteed + * to be visible via the other address, even if they're issued by the + * same thread. 
It is recommended that applications map the combined + * range instead of mapping separate buffers and then apply the + * appropriate offsets to the returned pointer to derive the + * individual buffers. + * + * The returned pointer \p devPtr must be freed using ::cuMemFree. + * + * \param devPtr - Returned device pointer to buffer + * \param extMem - Handle to external memory object + * \param bufferDesc - Buffer descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); + +/** + * \brief Maps a CUDA mipmapped array onto an external memory object + * + * Maps a CUDA mipmapped array onto an external object and returns a + * handle to it in \p mipmap. + * + * The properties of the CUDA mipmapped array being mapped must be + * described in \p mipmapDesc. The structure + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + unsigned long long offset; + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + unsigned int numLevels; + } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the + * offset in the memory object where the base level of the mipmap + * chain is. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes + * the format, dimensions and type of the base level of the mipmap + * chain. For further details on these parameters, please refer to the + * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped + * array is bound as a color target in the graphics API, then the flag + * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies + * the total number of levels in the mipmap chain. + * + * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. + * + * \param mipmap - Returned CUDA mipmapped array + * \param extMem - Handle to external memory object + * \param mipmapDesc - CUDA array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer + */ +CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); + +/** + * \brief Destroys an external memory object. + * + * Destroys the specified external memory object. Any existing buffers + * and CUDA mipmapped arrays mapped onto this object must no longer be + * used and must be explicitly freed using ::cuMemFree and + * ::cuMipmappedArrayDestroy respectively. 
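/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * importing an opaque-FD memory object (e.g. exported by Vulkan) and mapping a
 * buffer out of it, using the descriptors documented above. The file
 * descriptor `fd` and `size` are assumed to come from the exporting API;
 * error handling elided.
 */
#include <cuda.h>
#include <string.h>

static CUdeviceptr import_fd_buffer(int fd, unsigned long long size, CUexternalMemory *extMemOut)
{
    CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc;
    memset(&memDesc, 0, sizeof(memDesc));
    memDesc.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
    memDesc.handle.fd = fd;            /* ownership of fd passes to the driver */
    memDesc.size      = size;

    cuImportExternalMemory(extMemOut, &memDesc);

    CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc;
    memset(&bufDesc, 0, sizeof(bufDesc));
    bufDesc.offset = 0;
    bufDesc.size   = size;

    CUdeviceptr dptr = 0;
    cuExternalMemoryGetMappedBuffer(&dptr, *extMemOut, &bufDesc);
    return dptr;   /* free with cuMemFree before cuDestroyExternalMemory(*extMemOut) */
}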
+ * + * \param extMem - External memory object to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); + +/** + * \brief Imports an external semaphore + * + * Imports an externally allocated synchronization object and returns + * a handle to that in \p extSem_out. + * + * The properties of the handle being imported must be described in + * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + CUexternalSemaphoreHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + } handle; + unsigned int flags; + } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of + * handle being imported. ::CUexternalSemaphoreHandleType is defined + * as: + * + * \code + typedef enum CUexternalSemaphoreHandleType_enum { + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 + } CUexternalSemaphoreHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a synchronization object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * synchronization object are destroyed. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. 
If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Fence object. This handle holds a reference to the underlying + * object. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D12Fence object. + * + * \param extSem_out - Returned handle to an external semaphore + * \param semHandleDesc - Semaphore import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); + +/** + * \brief Signals a set of external semaphore objects + * + * Enqueues a signal operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of signaling a semaphore depends on the type of + * the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then signaling the semaphore will set it to the signaled state. + * + * If the semaphore object is of the type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then the + * semaphore will be set to the value specified in + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. + * + * \param extSemArray - Set of external semaphores to be signaled + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to signal + * \param stream - Stream to enqueue the signal operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Waits on a set of external semaphore objects + * + * Enqueues a wait operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of waiting on a semaphore depends on the type + * of the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then waiting on the semaphore will wait until the semaphore reaches + * the signaled state. The semaphore will then be reset to the + * unsignaled state. Therefore for every signal operation, there can + * only be one wait operation. 
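/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * signalling and then waiting on one imported semaphore with the APIs
 * documented above. `extSem` is assumed to come from
 * ::cuImportExternalSemaphore; zero-initialised params are assumed to be
 * sufficient for the opaque (non-D3D12-fence) handle types, where no fence
 * value is needed. Errors are ignored.
 */
#include <cuda.h>
#include <string.h>

static void signal_then_wait(CUexternalSemaphore extSem, CUstream producer, CUstream consumer)
{
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig;
    memset(&sig, 0, sizeof(sig));
    cuSignalExternalSemaphoresAsync(&extSem, &sig, 1, producer);

    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS wait;
    memset(&wait, 0, sizeof(wait));
    cuWaitExternalSemaphoresAsync(&extSem, &wait, 1, consumer);
}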
+ * + * If the semaphore object is of the type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then waiting on + * the semaphore will wait until the value of the semaphore is + * greater than or equal to + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. + * + * \param extSemArray - External semaphores to be waited on + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to wait on + * \param stream - Stream to enqueue the wait operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync + */ +CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Destroys an external semaphore + * + * Destroys an external semaphore object and releases any references + * to the underlying resource. Any outstanding signals or waits must + * have completed before the semaphore is destroyed. + * + * \param extSem - External semaphore to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); + +#endif /* __CUDA_API_VERSION >= 10000 */ + +/** @} */ /* END CUDA_EXTRES_INTEROP */ + +/** + * \defgroup CUDA_MEMOP Stream memory operations + * + * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream memory operations of the low-level CUDA + * driver application programming interface. + * + * The whole set of operations is disabled by default. Users are required + * to explicitly enable them, e.g. on Linux by passing the kernel module + * parameter shown below: + * modprobe nvidia NVreg_EnableStreamMemOPs=1 + * There is currently no way to enable these operations on other operating + * systems. + * + * Users can programmatically query whether the device supports these + * operations with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. + * + * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() + * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and + * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and + * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform + * hardware features and can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. + * + * Note that all memory pointers passed as parameters to these operations + * are device pointers. Where necessary a device pointer should be + * obtained, for example with ::cuMemHostGetDevicePointer(). + * + * None of the operations accepts pointers to managed memory buffers + * (::cuMemAllocManaged). 
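+ *
+ * As an illustrative sketch (not part of the original documentation), one
+ * stream can release another through a 32-bit flag, assuming \c dev is a
+ * valid ::CUdevice, \c flagPtr was obtained via ::cuMemHostGetDevicePointer(),
+ * and \c producer / \c consumer are valid streams:
+ * \code
+    int supported = 0;
+    cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
+    if (supported) {
+        // consumer blocks until the 32-bit value at flagPtr is >= 1
+        cuStreamWaitValue32(consumer, flagPtr, 1, CU_STREAM_WAIT_VALUE_GEQ);
+        // producer sets the flag once all prior work in its stream completes
+        cuStreamWriteValue32(producer, flagPtr, 1, CU_STREAM_WRITE_VALUE_DEFAULT);
+    }
+ * \endcode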
+ * + * @{ + */ + +#if __CUDA_API_VERSION >= 8000 +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64 + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + * flag is passed, the write is preceded by a system-wide memory fence, + * equivalent to a __threadfence_system() but scoped to the stream + * rather than a CUDA thread. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * \param stream The stream to do the write in. 
+ * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue64, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + * flag is passed, the write is preceded by a system-wide memory fence, + * equivalent to a __threadfence_system() but scoped to the stream + * rather than a CUDA thread. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue32, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Batch operations to synchronize the stream via memory operations + * + * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). + * Batching operations may avoid some performance overhead in both the API call + * and the device execution versus adding them to the stream in separate API + * calls. The operations are enqueued in the order they appear in the array. + * + * See ::CUstreamBatchMemOpType for the full set of supported operations, and + * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), + * and ::cuStreamWriteValue64() for details of specific operations. + * + * Basic support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details + * on querying support for specific operations. + * + * \param stream The stream to enqueue the operations in. + * \param count The number of operations in the array. Must be less than 256. + * \param paramArray The types and parameters of the individual operations. + * \param flags Reserved for future expansion; must be 0. 
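+ *
+ * An illustrative sketch (not part of the original documentation) of batching
+ * one wait and one write, assuming \c addrA and \c addrB are valid device
+ * pointers to 32-bit locations and \c hStream is a valid stream:
+ * \code
+    CUstreamBatchMemOpParams ops[2];
+    memset(ops, 0, sizeof(ops));
+    ops[0].operation          = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    ops[0].waitValue.address  = addrA;
+    ops[0].waitValue.value    = 1;
+    ops[0].waitValue.flags    = CU_STREAM_WAIT_VALUE_GEQ;
+    ops[1].operation          = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+    ops[1].writeValue.address = addrB;
+    ops[1].writeValue.value   = 1;
+    ops[1].writeValue.flags   = CU_STREAM_WRITE_VALUE_DEFAULT;
+    cuStreamBatchMemOp(hStream, 2, ops, 0);
+ * \endcode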
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuMemHostRegister + */ +CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** @} */ /* END CUDA_MEMOP */ + +/** + * \defgroup CUDA_EXEC Execution Control + * + * ___MANBRIEF___ execution control functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the execution control functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns information about a function + * + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel + * given by \p hfunc. The supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + * per block, beyond which a launch of the function would fail. This number + * depends on both the function and the device on which the function is + * currently loaded. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory per block required by this function. + * This does not include dynamically-allocated shared memory requested by + * the user at runtime. + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated + * constant memory required by this function. + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory + * used by each thread of this function. + * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread + * of this function. + * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for + * which the function was compiled. This value is the major PTX version * 10 + * + the minor PTX version, so a PTX version 1.3 function would return the + * value 13. Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for + * which the function was compiled. This value is the major binary + * version * 10 + the minor binary version, so a binary version 1.3 function + * would return the value 13. Note that this will return a value of 10 for + * legacy cubins that do not have a properly-encoded binary architecture + * version. + * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has + * been compiled with user specified option "-Xptxas --dlcm=ca" set . + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of + * dynamically-allocated shared memory. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 + * cache split ratio in percent of total shared memory. 
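+ *
+ * For example (an illustrative sketch, assuming \c hfunc is a valid
+ * ::CUfunction), the launch bound of a kernel can be queried as:
+ * \code
+    int maxThreads = 0;
+    cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
+    // maxThreads now holds the largest block size this kernel can be launched with
+ * \endcode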
+ * + * \param pi - Returned attribute value + * \param attrib - Attribute requested + * \param hfunc - Function to query attribute of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes + * ::cudaFuncSetAttribute + */ +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); + +#if __CUDA_API_VERSION >= 9000 + +/** + * \brief Sets information about a function + * + * This call sets the value of a specified attribute \p attrib on the kernel given + * by \p hfunc to an integer value specified by \p val + * This function returns CUDA_SUCCESS if the new value of the attribute could be + * successfully set. If the set fails, this call will return an error. + * Not all attributes can have values set. Attempting to set a value on a read-only + * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) + * + * Supported attributes for the cuFuncSetAttribute call are: + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of + * dynamically-allocated shared memory. The value should contain the requested + * maximum size of dynamically-allocated shared memory. The sum of this value and + * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the + * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. + * The maximal size of requestable dynamic shared memory may differ by GPU + * architecture. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 + * cache and shared memory use the same hardware resources, this sets the shared memory + * carveout preference, in percent of the total shared memory. + * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * + * \param hfunc - Function to query attribute of + * \param attrib - Attribute requested + * \param value - The value to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes + * ::cudaFuncSetAttribute + */ +CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); +#endif // __CUDA_API_VERSION >= 9000 + +/** + * \brief Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the device function \p hfunc. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute \p hfunc. Any context-wide preference + * set via ::cuCtxSetCacheConfig() will be overridden by this per-function + * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In + * that case, the current context-wide setting will be used. 
+ * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param hfunc - Kernel to configure cache for + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetCacheConfig + */ +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); + +#if __CUDA_API_VERSION >= 4020 +/** + * \brief Sets the shared memory configuration for a device function. + * + * On devices with configurable shared memory banks, this function will + * force all subsequent launches of the specified device function to have + * the given shared memory bank size configuration. On any given launch of the + * function, the shared memory configuration of the device will be temporarily + * changed if needed to suit the function's preferred configuration. Changes in + * shared memory configuration between subsequent launches of functions, + * may introduce a device side synchronization point. + * + * Any per-function setting of shared memory bank size set via + * ::cuFuncSetSharedMemConfig will override the context wide setting set with + * ::cuCtxSetSharedMemConfig. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory + * configuration when launching this function. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes when launching this function. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes when launching this function. 
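+ *
+ * An illustrative sketch (not part of the original documentation), assuming
+ * \c hfunc is a valid ::CUfunction whose kernel mainly accesses 64-bit words
+ * in shared memory:
+ * \code
+    // prefer a larger shared-memory carveout and eight-byte banks for this kernel
+    cuFuncSetCacheConfig(hfunc, CU_FUNC_CACHE_PREFER_SHARED);
+    cuFuncSetSharedMemConfig(hfunc, CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);
+ * \endcode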
+ * + * \param hfunc - kernel to be given a shared memory config + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxSetSharedMemConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetSharedMemConfig + */ +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); +#endif + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Launches a CUDA function + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p f can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into + * a single buffer that is passed in via the \p extra parameter. + * This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. Here is + * an example of using the \p extra parameter in this manner: + * \code + size_t argBufferSize; + char argBuffer[256]; + + // populate argBuffer and argBufferSize + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, + CU_LAUNCH_PARAM_END + }; + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); + * \endcode + * + * The \p extra parameter exists to allow ::cuLaunchKernel to take + * additional less commonly used arguments. \p extra specifies a list of + * names of extra settings and their corresponding values. Each extra + * setting name is immediately followed by the corresponding value. The + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer containing all + * the kernel parameters for launching kernel \p f; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t containing the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel + * parameters are specified with both \p kernelParams and \p extra + * (i.e. both \p kernelParams and \p extra are non-NULL). + * + * Calling ::cuLaunchKernel() sets persistent function state that is + * the same as function state set through the following deprecated APIs: + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), + * ::cuParamSetv(). 
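+ *
+ * As an illustrative counterpart to the \p extra example above (not part of
+ * the original documentation), method 1) with \p kernelParams might look as
+ * follows, assuming \c f takes a device pointer and an \c int and that
+ * \c d_data, \c n and \c hStream are valid:
+ * \code
+    void *params[] = { &d_data, &n };
+    // 256-thread blocks over a one-dimensional grid; no dynamic shared memory
+    status = cuLaunchKernel(f, (n + 255) / 256, 1, 1, 256, 1, 1, 0, hStream, params, NULL);
+ * \endcode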
+ * + * When the kernel \p f is launched via ::cuLaunchKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cudaLaunchKernel + */ +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra); +#endif /* __CUDA_API_VERSION >= 4000 */ +#if __CUDA_API_VERSION >= 9000 +/** + * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * Kernel parameters must be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. 
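+ *
+ * An illustrative sketch (not part of the original documentation) of sizing a
+ * cooperative launch against the limit described above, assuming \c f,
+ * \c dev and \c hStream are valid and \c kernelParams is set up as described:
+ * \code
+    int numSms = 0, blocksPerSm = 0;
+    cuDeviceGetAttribute(&numSms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+    cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, f, 256, 0);
+    // the grid may not exceed blocksPerSm * numSms blocks of 256 threads each
+    cuLaunchCooperativeKernel(f, blocksPerSm * numSms, 1, 1, 256, 1, 1, 0, hStream, kernelParams);
+ * \endcode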
+ * + * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API + * + * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernelMultiDevice, + * ::cudaLaunchCooperativeKernel + */ +CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams); + +/** + * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute + * + * Invokes kernels as specified in the \p launchParamsList array where each element + * of the array specifies all the parameters required to perform a single kernel launch. + * These kernels can cooperate and synchronize as they execute. The size of the array is + * specified by \p numDevices. + * + * No two kernels can be launched on the same device. All the devices targeted by this + * multi-device launch must be identical. All devices must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. + * + * All kernels launched must be identical with respect to the compiled code. Note that + * any __device__, __constant__ or __managed__ variables present in the module that owns + * the kernel launched on each device, are independently instantiated on every device. + * It is the application's responsibility to ensure these variables are initialized and + * used appropriately. + * + * The size of the grids as specified in blocks, the size of the blocks themselves + * and the amount of shared memory used by each thread block must also match across + * all launched kernels. 
+ * + * The streams used to launch these kernels must have been created via either ::cuStreamCreate + * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD + * cannot be used. + * + * The total number of blocks launched per kernel cannot exceed the maximum number of blocks + * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the + * total number of blocks launched per device has to match across all devices, the maximum + * number of blocks that can be launched per device will be limited by the device with the + * least number of multiprocessors. + * + * The kernels cannot make use of CUDA dynamic parallelism. + * + * The ::CUDA_LAUNCH_PARAMS structure is defined as: + * \code + typedef struct CUDA_LAUNCH_PARAMS_st + { + CUfunction function; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; + } CUDA_LAUNCH_PARAMS; + * \endcode + * where: + * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must + * be identical with respect to the compiled code. + * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the Y dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. + * This must match across all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot + * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated + * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. + * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If + * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams + * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through + * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual + * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes + * do not need to be specified as that information is retrieved directly from the kernel's image. + * + * By default, the kernel won't begin execution on any GPU until all prior work in all the specified + * streams has completed. This behavior can be overridden by specifying the flag + * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. 
When this flag is specified, each kernel + * will only wait for prior work in the stream corresponding to that GPU to complete before it begins + * execution. + * + * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin + * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying + * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, + * any subsequent work pushed in any of the specified streams will only wait for the kernel launched + * on the GPU corresponding to that stream to complete before it begins execution. + * + * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API when called individually for each + * element in \p launchParamsList. + * + * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous + * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function + * in \p launchParamsList is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param launchParamsList - List of launch parameters, one per device + * \param numDevices - Size of the \p launchParamsList array + * \param flags - Flags to control launch behavior + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernel, + * ::cudaLaunchCooperativeKernelMultiDevice + */ +CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 9000 */ + +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Enqueues a host function call in a stream + * + * Enqueues a host function to run in a stream. The function will be called + * after currently enqueued work and will block work added after it. + * + * The host function must not make any CUDA API calls. Attempting to use a + * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. + * The host function must not perform any synchronization that may depend on + * outstanding CUDA work not mandated to run earlier. Host functions without a + * mandated order (such as in independent streams) execute in undefined order + * and may be serialized. + * + * For the purposes of Unified Memory, execution makes a number of guarantees: + *
+ * - The stream is considered idle for the duration of the function's
+ *   execution. Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.
+ * - The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function. It thus synchronizes streams which have been "joined"
+ *   prior to the function.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed. Thus, for example, a function might use global attached
+ *   memory even if work has been added to another stream, if the work has
+ *   been ordered behind the function call with an event.
+ * - Completion of the function does not cause a stream to become active
+ *   except as described above. The stream will remain idle if no device work
+ *   follows the function, and will remain idle across consecutive host
+ *   functions or stream callbacks without device work in between. Thus, for
+ *   example, stream synchronization can be done by signaling from a host
+ *   function at the end of the stream.
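+ *
+ * An illustrative sketch (not part of the original documentation) of using a
+ * host function to signal completion of a stream, assuming \c hStream is a
+ * valid stream:
+ * \code
+    void CUDA_CB markDone(void *userData) { *(volatile int *)userData = 1; }
+
+    volatile int done = 0;
+    // markDone runs on the host after all work previously enqueued in hStream
+    cuLaunchHostFunc(hStream, markDone, (void *)&done);
+ * \endcode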
+ * + * Note that, in contrast to ::cuStreamAddCallback, the function will not be + * called in the event of an error in the CUDA context. + * + * \param hStream - Stream to enqueue function call in + * \param fn - The function to call once preceding stream operations are complete + * \param userData - User-specified data to be passed to the function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + +#endif /* __CUDA_API_VERSION >= 10000 */ + +/** @} */ /* END CUDA_EXEC */ + +/** + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] + * + * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated execution control functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the block-dimensions for the function + * + * \deprecated + * + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are + * created when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dimensions of + * \param x - X dimension + * \param y - Y dimension + * \param z - Z dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetSharedSize, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); + +/** + * \brief Sets the dynamic shared-memory size for the function + * + * \deprecated + * + * Sets through \p bytes the amount of dynamic shared memory that will be + * available to each thread block when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dynamic shared-memory size for + * \param bytes - Dynamic shared-memory size per thread in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); + +/** + * \brief Sets the parameter size for the function + * + * \deprecated + * + * Sets through \p numbytes the total size in bytes needed by the function + * parameters of the kernel corresponding to \p hfunc. 
+ * + * \param hfunc - Kernel to set parameter size for + * \param numbytes - Size of parameter list in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); + +/** + * \brief Adds an integer parameter to the function's argument list + * + * \deprecated + * + * Sets an integer parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); + +/** + * \brief Adds a floating-point parameter to the function's argument list + * + * \deprecated + * + * Sets a floating-point parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); + +/** + * \brief Adds arbitrary data to the function's argument list + * + * \deprecated + * + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr + * into the parameter space of the kernel corresponding to \p hfunc. \p offset + * is a byte offset. 
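+ *
+ * An illustrative sketch (not part of the original documentation) of the
+ * deprecated launch sequence these functions belong to, assuming \c hfunc is
+ * a valid ::CUfunction taking a single ::CUdeviceptr argument:
+ * \code
+    CUdeviceptr d_data;   // assumed to have been allocated with cuMemAlloc()
+    cuFuncSetBlockShape(hfunc, 256, 1, 1);
+    cuParamSetSize(hfunc, sizeof(d_data));
+    cuParamSetv(hfunc, 0, &d_data, sizeof(d_data));
+    cuLaunchGrid(hfunc, 64, 1);   // 64 x 1 grid of 256-thread blocks
+ * \endcode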
+ * + * \param hfunc - Kernel to add data to + * \param offset - Offset to add data to argument list + * \param ptr - Pointer to arbitrary data + * \param numbytes - Size of data to copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block + * contains the number of threads specified by a previous call to + * ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). 
+ * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * + * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), + * this function may serialize kernel launches. In order to force the CUDA driver to retain + * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). + * + * \note_null_stream + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); + + +/** + * \brief Adds a texture-reference to the function's argument list + * + * \deprecated + * + * Makes the CUDA array or linear memory bound to the texture reference + * \p hTexRef available to a device program as a texture. In this version of + * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and + * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. + * + * \param hfunc - Kernel to add texture-reference to + * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) + * \param hTexRef - Texture-reference to add to argument list + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +/** @} */ /* END CUDA_EXEC_DEPRECATED */ + +#if __CUDA_API_VERSION >= 10000 +/** + * \defgroup CUDA_GRAPH Graph Management + * + * ___MANBRIEF___ graph management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graph management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates a graph + * + * Creates an empty graph, which is returned via \p phGraph. 
+ * + * \param phGraph - Returns newly created graph + * \param flags - Graph creation flags, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphInstantiate, + * ::cuGraphDestroy, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); + +/** + * \brief Creates a kernel execution node and adds it to a graph + * + * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The CUDA_KERNEL_NODE_PARAMS structure is defined as: + * + * \code + * typedef struct CUDA_KERNEL_NODE_PARAMS_st { + * CUfunction func; + * unsigned int gridDimX; + * unsigned int gridDimY; + * unsigned int gridDimZ; + * unsigned int blockDimX; + * unsigned int blockDimY; + * unsigned int blockDimZ; + * unsigned int sharedMemBytes; + * void **kernelParams; + * void **extra; + * } CUDA_KERNEL_NODE_PARAMS; + * \endcode + * + * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x + * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains + * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p func can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N + * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, + * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual + * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need + * to be specified as that information is retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in + * via \p extra. This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. The \p extra parameter exists + * to allow this function to take additional less commonly used arguments. \p extra specifies + * a list of names of extra settings and their corresponding values. Each extra setting name is + * immediately followed by the corresponding value. The list must be terminated with either NULL or + * CU_LAUNCH_PARAM_END. 
+ * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer + * containing all the kernel parameters for launching kernel + * \p func; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t + * containing the size of the buffer specified with + * ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both + * \p kernelParams and \p extra (i.e. both \p kernelParams and + * \p extra are non-NULL). + * + * The \p kernelParams or \p extra array, as well as the argument values it points to, + * are copied during this call. + * + * \note Kernels launched using graphs must not use texture and surface references. Reading or + * writing through any texture or surface reference is undefined behavior. + * This restriction does not apply to texture and surface objects. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the GPU execution node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a kernel node's parameters + * + * Returns the parameters of kernel node \p hNode in \p nodeParams. + * The \p kernelParams or \p extra array returned in \p nodeParams, + * as well as the argument values it points to, are owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphKernelNodeSetParams to update the + * parameters of this node. + * + * The params will contain either \p kernelParams or \p extra, + * according to which of these was most recently set on the node. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams + */ +CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a kernel node's parameters + * + * Sets the parameters of kernel node \p hNode to \p nodeParams. 
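+ *
+ * For example (an illustrative sketch, assuming \c hNode is a valid kernel
+ * node), the grid size of an existing node can be updated as:
+ * \code
+    CUDA_KERNEL_NODE_PARAMS p;
+    cuGraphKernelNodeGetParams(hNode, &p);
+    p.gridDimX *= 2;                       // double the grid width
+    cuGraphKernelNodeSetParams(hNode, &p); // apply the modified parameters
+ * \endcode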
+ * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeGetParams + */ +CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a memcpy node and adds it to a graph + * + * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will perform the memcpy described by \p copyParams. + * See ::cuMemcpy3D() for a description of the structure and its restrictions. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer + * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed + * for those operand(s). The managed memory will be treated as residing on either the + * host or the device, depending on which memory type is specified. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param copyParams - Parameters for the memory copy + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Returns a memcpy node's parameters + * + * Returns the parameters of memcpy node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Sets a memcpy node's parameters + * + * Sets the parameters of memcpy node \p hNode to \p nodeParams. 
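+ *
+ * An illustrative sketch (not part of the original documentation) of updating
+ * a node to perform a linear device-to-device copy, assuming \c hNode, the
+ * device pointers \c dstPtr / \c srcPtr and the size \c nbytes are valid:
+ * \code
+    CUDA_MEMCPY3D p;
+    memset(&p, 0, sizeof(p));
+    p.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    p.srcDevice     = srcPtr;
+    p.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    p.dstDevice     = dstPtr;
+    p.WidthInBytes  = nbytes;   // describe the 1-D copy as a 3-D copy of height/depth 1
+    p.Height        = 1;
+    p.Depth         = 1;
+    cuGraphMemcpyNodeSetParams(hNode, &p);
+ * \endcode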
+ * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeGetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Creates a memset node and adds it to a graph + * + * Creates a new memset node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The element size must be 1, 2, or 4 bytes. + * When the graph is launched, the node will perform the memset described by \p memsetParams. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param memsetParams - Parameters for the memory set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CONTEXT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode + */ +CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Returns a memset node's parameters + * + * Returns the parameters of memset node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a memset node's parameters + * + * Sets the parameters of memset node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeGetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a host execution node and adds it to a graph + * + * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. 
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will invoke the specified CPU function. + * Host nodes are not supported under MPS with pre-Volta GPUs. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the host node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a host node's parameters + * + * Returns the parameters of host node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams + */ +CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a host node's parameters + * + * Sets the parameters of host node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeGetParams + */ +CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a child graph node and adds it to a graph + * + * Creates a new node which executes an embedded graph, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The node executes an embedded child graph. The child graph is cloned in this call. 
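+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p hGraph, \p childGraph and \p dep are hypothetical handles owned by the caller:
+ * \code
+ * CUgraphNode childNode;
+ * CUgraphNode deps[] = { dep };                  // run after \p dep completes
+ * cuGraphAddChildGraphNode(&childNode, hGraph, deps, 1, childGraph);
+ * // The node keeps its own clone; later edits to \p childGraph do not affect it.
+ * \endcode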
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param childGraph - The graph to clone into this node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); + +/** + * \brief Gets a handle to the embedded graph of a child graph node + * + * Gets a handle to the embedded graph in a child graph node. This call + * does not clone the graph. Changes to the graph will be reflected in + * the node, and the node retains ownership of the graph. + * + * \param hNode - Node to get the embedded graph for + * \param phGraph - Location to store a handle to the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); + +/** + * \brief Creates an empty node and adds it to a graph + * + * Creates a new node which performs no operation, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * An empty node performs no operation during execution, but can be used for + * transitive ordering. For example, a phased execution graph with 2 groups of n + * nodes with a barrier between them can be represented using an empty node and + * 2*n dependency edges, rather than no empty node and n^2 dependency edges. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); + +/** + * \brief Clones a graph + * + * This function creates a copy of \p originalGraph and returns it in \p * phGraphClone. + * All parameters are copied into the cloned graph. The original graph may be modified + * after this call without affecting the clone. + * + * Child graph nodes in the original graph are recursively copied into the clone. 
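+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p hGraph and \p kernelNode are hypothetical handles from previously built graph code:
+ * \code
+ * CUgraph clone;
+ * cuGraphClone(&clone, hGraph);
+ * // Locate the clone's copy of a node that was created in the original graph:
+ * CUgraphNode clonedKernelNode;
+ * cuGraphNodeFindInClone(&clonedKernelNode, kernelNode, clone);
+ * \endcode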
+ * + * \param phGraphClone - Returns newly created cloned graph + * \param originalGraph - Graph to clone + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); + +/** + * \brief Finds a cloned version of a node + * + * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode + * in the original graph. + * + * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone. + * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to + * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have + * been removed. The cloned node is then returned via \p phClonedNode. + * + * \param phNode - Returns handle to the cloned node + * \param hOriginalNode - Handle to the original node + * \param hClonedGraph - Cloned graph to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); + +/** + * \brief Returns a node's type + * + * Returns the node type of \p hNode in \p type. + * + * \param hNode - Node to query + * \param type - Pointer to return the node type + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); + +/** + * \brief Returns a graph's nodes + * + * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this + * function will return the number of nodes in \p numNodes. Otherwise, + * \p numNodes entries will be filled in. If \p numNodes is higher than the actual + * number of nodes, the remaining entries in \p nodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numNodes. + * + * \param hGraph - Graph to query + * \param nodes - Pointer to return the nodes + * \param numNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); + +/** + * \brief Returns a graph's root nodes + * + * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this + * function will return the number of root nodes in \p numRootNodes. Otherwise, + * \p numRootNodes entries will be filled in. 
If \p numRootNodes is higher than the actual + * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numRootNodes. + * + * \param hGraph - Graph to query + * \param rootNodes - Pointer to return the root nodes + * \param numRootNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); + +/** + * \brief Returns a graph's dependency edges + * + * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding + * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the + * node in \p from[i]. \p from and \p to may both be NULL, in which + * case this function only returns the number of edges in \p numEdges. Otherwise, + * \p numEdges entries will be filled in. If \p numEdges is higher than the actual + * number of edges, the remaining entries in \p from and \p to will be set to NULL, and + * the number of edges actually returned will be written to \p numEdges. + * + * \param hGraph - Graph to get the edges from + * \param from - Location to return edge endpoints + * \param to - Location to return edge endpoints + * \param numEdges - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); + +/** + * \brief Returns a node's dependencies + * + * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this + * function will return the number of dependencies in \p numDependencies. Otherwise, + * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual + * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numDependencies. + * + * \param hNode - Node to query + * \param dependencies - Pointer to return the dependencies + * \param numDependencies - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependentNodes, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); + +/** + * \brief Returns a node's dependent nodes + * + * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which + * case this function will return the number of dependent nodes in \p numDependentNodes. + * Otherwise, \p numDependentNodes entries will be filled in. 
If \p numDependentNodes is + * higher than the actual number of dependent nodes, the remaining entries in + * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will + * be returned in \p numDependentNodes. + * + * \param hNode - Node to query + * \param dependentNodes - Pointer to return the dependent nodes + * \param numDependentNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependencies, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); + +/** + * \brief Adds dependency edges to a graph + * + * The number of dependencies to be added is defined by \p numDependencies + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying an existing dependency will return an error. + * + * \param hGraph - Graph to which dependencies are added + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be added + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphRemoveDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Removes dependency edges from a graph + * + * The number of \p dependencies to be removed is defined by \p numDependencies. + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying a non-existing dependency will return an error. + * + * \param hGraph - Graph from which to remove dependencies + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be removed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Remove a node from the graph + * + * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes + * on \p hNode and vice versa. 
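+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p hGraph, \p a and \p b are hypothetical handles for an existing graph and
+ * two of its nodes:
+ * \code
+ * cuGraphAddDependencies(hGraph, &a, &b, 1);   // b now depends on a
+ * cuGraphDestroyNode(b);                       // removes b and severs the a -> b edge
+ * \endcode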
+ * + * \param hNode - Node to remove + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p graphExec. + * + * If there are any errors, diagnostic information may be returned in \p errorNode and + * \p logBuffer. This is the primary way to inspect instantiation errors. The output + * will be null terminated unless the diagnostics overflow + * the buffer. In this case, they will be truncated, and the last byte can be + * inspected to determine if truncation occurred. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param phErrorNode - In case of an instantiation error, this may be modified to + * indicate a node contributing to the error + * \param logBuffer - A character buffer to store diagnostic messages + * \param bufferSize - Size of the log buffer in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + + +#if __CUDA_API_VERSION >= 10010 +/** + * \brief Sets the parameters for a kernel node in the given graphExec + * + * Sets the parameters of a kernel node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. The \p func field + * of \p nodeParams cannot be modified and must match the original value. + * All other values can be modified. + * + * The modifications take effect at the next launch of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - kernel node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphInstantiate + */ + CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +#endif /* __CUDA_API_VERSION >= 10010 */ + +/** + * \brief Launches an executable graph in a stream + * + * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing + * at a time. Each launch is ordered behind both any previous work in \p hStream + * and any previous launches of \p hGraphExec. 
To execute a graph concurrently, it must be + * instantiated multiple times into multiple executable graphs. + * + * \param hGraphExec - Executable graph to launch + * \param hStream - Stream in which to launch the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); + +/** + * \brief Destroys an executable graph + * + * Destroys the executable graph specified by \p hGraphExec, as well + * as all of its executable nodes. If the executable graph is + * in-flight, it will not be terminated, but rather freed + * asynchronously on completion. + * + * \param hGraphExec - Executable graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); + +/** + * \brief Destroys a graph + * + * Destroys the graph specified by \p hGraph, as well as all of its nodes. + * + * \param hGraph - Graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); +/** @} */ /* END CUDA_GRAPH */ +#endif /* __CUDA_API_VERSION >= 10000 */ + +#if __CUDA_API_VERSION >= 6050 +/** + * \defgroup CUDA_OCCUPANCY Occupancy + * + * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the occupancy calculation functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * The \p Flags parameter controls how special cases are handled. The + * valid flags are: + * + * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as + * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; + * + * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the + * default behavior on platform where global caching affects + * occupancy. 
On such platforms, if caching is enabled, but + * per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching + * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes + * the occupancy calculator return 0 in such cases. More information + * can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * Returns in \p *blockSize a reasonable block size that can achieve + * the maximum occupancy (or, the maximum number of active warps with + * the fewest blocks per multiprocessor), and in \p *minGridSize the + * minimum grid size to achieve the maximum occupancy. + * + * If \p blockSizeLimit is 0, the configurator will use the maximum + * block size permitted by the device / function instead. + * + * If per-block dynamic shared memory allocation is not needed, the + * user should leave both \p blockSizeToDynamicSMemSize and \p + * dynamicSMemSize as 0. + * + * If per-block dynamic shared memory allocation is needed, then if + * the dynamic shared memory size is constant regardless of block + * size, the size should be passed through \p dynamicSMemSize, and \p + * blockSizeToDynamicSMemSize should be NULL. + * + * Otherwise, if the per-block dynamic shared memory size varies with + * different block sizes, the user needs to provide a unary function + * through \p blockSizeToDynamicSMemSize that computes the dynamic + * shared memory needed by \p func for any given block size. \p + * dynamicSMemSize is ignored. 
An example signature is: + * + * \code + * // Take block size, returns dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * \endcode + * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSize + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * An extended version of ::cuOccupancyMaxPotentialBlockSize. In + * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, + * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags + * parameter. + * + * The \p Flags parameter controls how special cases are handled. The + * valid flags are: + * + * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as + * ::cuOccupancyMaxPotentialBlockSize; + * + * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the + * default behavior on platforms where global caching affects + * occupancy. On such platforms, the launch configurations that + * produce maximal occupancy might not support global + * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE + * guarantees that the produced launch configuration is global + * caching compatible at a potential cost of occupancy. More information + * can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. 
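+ *
+ * A minimal, illustrative sketch (not part of the upstream header text) of
+ * sizing a 1D launch; \p func (a loaded CUfunction) and \p n (a problem size)
+ * are hypothetical:
+ * \code
+ * int minGridSize = 0, blockSize = 0;
+ * cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, func,
+ *                                           NULL, 0, 0, CU_OCCUPANCY_DEFAULT);
+ * // Round the grid up so every element of the hypothetical problem is covered.
+ * int gridSize = (int)((n + blockSize - 1) / blockSize);
+ * \endcode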
+ * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * \param flags - Options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); + +/** @} */ /* END CUDA_OCCUPANCY */ +#endif /* __CUDA_API_VERSION >= 6050 */ + +/** + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated texture reference management functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated texture reference management + * functions of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Binds an array as a texture reference + * + * \deprecated + * + * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to + * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is + * unbound. + * + * \param hTexRef - Texture reference to bind + * \param hArray - Array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray + */ +CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Binds a mipmapped array to a texture reference + * + * \deprecated + * + * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. + * Any previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. + * Any CUDA array previously bound to \p hTexRef is unbound. 
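+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p texRef and \p mipArray are hypothetical handles (e.g. from
+ * ::cuModuleGetTexRef and ::cuMipmappedArrayCreate):
+ * \code
+ * cuTexRefSetMipmappedArray(texRef, mipArray, CU_TRSA_OVERRIDE_FORMAT);
+ * // Mipmap sampling behaviour is then configured with the cuTexRefSetMipmap* calls below.
+ * \endcode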
+ * + * \param hTexRef - Texture reference to bind + * \param hMipmappedArray - Mipmapped array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Binds an address as a texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, ::cuTexRefSetAddress() passes back a byte offset in + * \p *ByteOffset that must be applied to texture fetches in order to read from + * the desired memory. This offset must be divided by the texel size and + * passed to kernels that read from the texture so they can be applied to the + * ::tex1Dfetch() function. + * + * If the device memory pointer was returned from ::cuMemAlloc(), the offset + * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. + * + * The total number of elements (or texels) in the linear address range + * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. + * The number of elements is computed as (\p bytes / bytesPerElement), + * where bytesPerElement is determined from the data format and number of + * components set using ::cuTexRefSetFormat(). + * + * \param ByteOffset - Returned byte offset + * \param hTexRef - Texture reference to bind + * \param dptr - Device pointer to bind + * \param bytes - Size of memory to bind in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture + */ +CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); + +/** + * \brief Binds an address as a 2D texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Using a ::tex2D() function inside a kernel requires a call to either + * ::cuTexRefSetArray() to bind the corresponding texture reference to an + * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear + * memory. 
+ * + * Function calls to ::cuTexRefSetFormat() cannot follow calls to + * ::cuTexRefSetAddress2D() for the same texture reference. + * + * It is required that \p dptr be aligned to the appropriate hardware-specific + * texture alignment. You can query this value using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. + * This value can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Width and Height, which are specified in elements (or texels), cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * \p Pitch, which is specified in bytes, cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * \param hTexRef - Texture reference to bind + * \param desc - Descriptor of CUDA array + * \param dptr - Device pointer to bind + * \param Pitch - Line pitch in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture2D + */ +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Sets the format for a texture reference + * + * \deprecated + * + * Specifies the format of the data to be read by the texture reference + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: + * They specify the format of each component and the number of components per + * array element. + * + * \param hTexRef - Texture reference + * \param fmt - Format to set + * \param NumPackedComponents - Number of components per array element + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaCreateChannelDesc, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); + +/** + * \brief Sets the addressing mode for a texture reference + * + * \deprecated + * + * Specifies the addressing mode \p am for the given dimension \p dim of the + * texture reference \p hTexRef. If \p dim is zero, the addressing mode is + * applied to the first parameter of the functions used to fetch from the + * texture; if \p dim is 1, the second, and so on. 
::CUaddress_mode is defined + * as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * \param hTexRef - Texture reference + * \param dim - Dimension + * \param am - Addressing mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); + +/** + * \brief Sets the filtering mode for a texture reference + * + * \deprecated + * + * Specifies the filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray + */ +CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Specifies the mipmap filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
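+ *
+ * A minimal, illustrative sketch (not part of the upstream header text) of a
+ * typical configuration combining this call with the related setters documented
+ * above; \p texRef is a hypothetical reference already bound to a mipmapped
+ * array, and CU_AD_FORMAT_FLOAT is assumed as a representative ::CUarray_format value:
+ * \code
+ * cuTexRefSetAddressMode(texRef, 0, CU_TR_ADDRESS_MODE_CLAMP);   // x dimension
+ * cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);       // within a level
+ * cuTexRefSetMipmapFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR); // between levels
+ * cuTexRefSetFormat(texRef, CU_AD_FORMAT_FLOAT, 4);              // float4 texels
+ * \endcode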
+ * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap level bias for a texture reference + * + * \deprecated + * + * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when + * reading memory through the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param bias - Mipmap level bias + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); + +/** + * \brief Sets the mipmap min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp + * respectively, to be used when reading memory through the texture reference + * \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param minMipmapLevelClamp - Mipmap min level clamp + * \param maxMipmapLevelClamp - Mipmap max level clamp + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); + +/** + * \brief Sets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through + * the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. 
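+ *
+ * A minimal, illustrative sketch (not part of the upstream header text)
+ * combining the mipmap sampling controls above; \p texRef is a hypothetical
+ * texture reference already bound to a mipmapped array:
+ * \code
+ * cuTexRefSetMipmapLevelBias(texRef, 0.5f);          // favour slightly coarser levels
+ * cuTexRefSetMipmapLevelClamp(texRef, 0.0f, 8.0f);   // restrict level selection
+ * cuTexRefSetMaxAnisotropy(texRef, 8);
+ * \endcode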
+ * + * \param hTexRef - Texture reference + * \param maxAniso - Maximum anisotropy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); + +/** + * \brief Sets the border color for a texture reference + * + * \deprecated + * + * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference + * \p hTexRef. The color value supports only float type and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * Note that the color values can be set only when the Address mode is set to + * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. + * Applications using integer border color values have to "reinterpret_cast" their values to float. + * + * \param hTexRef - Texture reference + * \param pBorderColor - RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddressMode, + * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); + +/** + * \brief Sets the flags for a texture reference + * + * \deprecated + * + * Specifies optional flags via \p Flags to specify the behavior of data + * returned through the texture reference \p hTexRef. The valid flags are: + * + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format + * would not be promoted, regardless of whether or not this + * flag is specified; + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the + * default behavior of having the texture coordinates range + * from [0, Dim) where Dim is the width or height of the CUDA + * array. 
Instead, the texture coordinates [0, 1.0) reference + * the entire breadth of the array dimension; + * + * \param hTexRef - Texture reference + * \param Flags - Optional flags to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets the address associated with a texture reference + * + * \deprecated + * + * Returns in \p *pdptr the base address bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any device memory range. + * + * \param pdptr - Returned device address + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Gets the array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA array. + * + * \param phArray - Returned array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); + +/** + * \brief Gets the mipmapped array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture + * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA mipmapped array. 
+ * + * \param phMipmappedArray - Returned mipmapped array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); + +/** + * \brief Gets the addressing mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pam the addressing mode corresponding to the + * dimension \p dim of the texture reference \p hTexRef. Currently, the only + * valid value for \p dim are 0 and 1. + * + * \param pam - Returned addressing mode + * \param hTexRef - Texture reference + * \param dim - Dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); + +/** + * \brief Gets the filter-mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pfm the filtering mode of the texture reference + * \p hTexRef. + * + * \param pfm - Returned filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the format used by a texture reference + * + * \deprecated + * + * Returns in \p *pFormat and \p *pNumChannels the format and number + * of components of the CUDA array bound to the texture reference \p hTexRef. + * If \p pFormat or \p pNumChannels is NULL, it will be ignored. 
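+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p texRef is a hypothetical, already-configured texture reference:
+ * \code
+ * CUarray_format fmt;
+ * int channels;
+ * cuTexRefGetFormat(&fmt, &channels, texRef);   // e.g. CU_AD_FORMAT_FLOAT, 4
+ * \endcode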
+ * + * \param pFormat - Returned format + * \param pNumChannels - Returned number of components + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags + */ +CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); + +/** + * \brief Gets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Returns the mipmap filtering mode in \p pfm that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pfm - Returned mipmap filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the mipmap level bias for a texture reference + * + * \deprecated + * + * Returns the mipmap level bias in \p pBias that's added to the specified mipmap + * level when reading memory through the texture reference \p hTexRef. + * + * \param pbias - Returned mipmap level bias + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); + +/** + * \brief Gets the min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp + * that's used when reading memory through the texture reference \p hTexRef. 
+ * + * \param pminMipmapLevelClamp - Returned mipmap min level clamp + * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); + +/** + * \brief Gets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pmaxAniso - Returned maximum anisotropy + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); + +/** + * \brief Gets the border color used by a texture reference + * + * \deprecated + * + * Returns in \p pBorderColor, values of the RGBA color used by + * the texture reference \p hTexRef. + * The color value is of type float and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * \param hTexRef - Texture reference + * \param pBorderColor - Returned Type and Value of RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddressMode, + * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor + */ +CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); + +/** + * \brief Gets the flags used by a texture reference + * + * \deprecated + * + * Returns in \p *pFlags the flags of the texture reference \p hTexRef. + * + * \param pFlags - Returned flags + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); + +/** + * \brief Creates a texture reference + * + * \deprecated + * + * Creates a texture reference and returns its handle in \p *pTexRef. 
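/*
 * Illustrative sketch of the deprecated texture-reference getters documented
 * above; not a complete program. The module handle `mod` and the texref name
 * "tex_in" are hypothetical, and error handling is reduced to a single check.
 */
#include <cuda.h>
#include <stdio.h>

static void print_texref_state(CUmodule mod)
{
    CUtexref ref;
    CUarray_format fmt;
    CUfilter_mode filter;
    CUaddress_mode addr;
    int channels = 0;
    unsigned int flags = 0;

    /* "tex_in" is assumed to be a module-scope texture reference. */
    if (cuModuleGetTexRef(&ref, mod, "tex_in") != CUDA_SUCCESS)
        return;

    cuTexRefGetFormat(&fmt, &channels, ref);
    cuTexRefGetFilterMode(&filter, ref);
    cuTexRefGetAddressMode(&addr, ref, 0);   /* dim may only be 0 or 1 */
    cuTexRefGetFlags(&flags, ref);

    printf("format=%d channels=%d filter=%d addr=%d flags=0x%x\n",
           (int)fmt, channels, (int)filter, (int)addr, flags);
}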
Once + * created, the application must call ::cuTexRefSetArray() or + * ::cuTexRefSetAddress() to associate the reference with allocated memory. + * Other texture reference functions are used to specify the format and + * interpretation (addressing, filtering, etc.) to be used when the memory is + * read through this texture reference. + * + * \param pTexRef - Returned texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefDestroy + */ +CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); + +/** + * \brief Destroys a texture reference + * + * \deprecated + * + * Destroys the texture reference specified by \p hTexRef. + * + * \param hTexRef - Texture reference to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefCreate + */ +CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); + +/** @} */ /* END CUDA_TEXREF_DEPRECATED */ + + +/** + * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] + * + * ___MANBRIEF___ surface reference management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface reference management functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the CUDA array for a surface reference. + * + * \deprecated + * + * Sets the CUDA array \p hArray to be read and written by the surface reference + * \p hSurfRef. Any previous CUDA array state associated with the surface + * reference is superseded by this function. \p Flags must be set to 0. + * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. + * Any CUDA array previously bound to \p hSurfRef is unbound. + + * \param hSurfRef - Surface reference handle + * \param hArray - CUDA array handle + * \param Flags - set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuModuleGetSurfRef, + * ::cuSurfRefGetArray, + * ::cudaBindSurfaceToArray + */ +CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Passes back the CUDA array bound to a surface reference. + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the surface reference + * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference + * is not bound to any CUDA array. + + * \param phArray - Surface reference handle + * \param hSurfRef - Surface reference handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray + */ +CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); + +/** @} */ /* END CUDA_SURFREF_DEPRECATED */ + +#if __CUDA_API_VERSION >= 5000 +/** + * \defgroup CUDA_TEXOBJECT Texture Object Management + * + * ___MANBRIEF___ texture object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the texture object management functions of the + * low-level CUDA driver application programming interface. 
The texture + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a texture object + * + * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes + * the data to texture from. \p pTexDesc describes how the data should be sampled. + * \p pResViewDesc is an optional argument that specifies an alternate format for + * the data described by \p pResDesc, and also describes the subresource region + * to restrict access to when texturing. \p pResViewDesc can only be specified if + * the type of resource is a CUDA array or a CUDA mipmapped array. + * + * Texture objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a texture object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The ::CUDA_RESOURCE_DESC structure is defined as: + * \code + typedef struct CUDA_RESOURCE_DESC_st + { + CUresourcetype resType; + + union { + struct { + CUarray hArray; + } array; + struct { + CUmipmappedArray hMipmappedArray; + } mipmap; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t sizeInBytes; + } linear; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; + + unsigned int flags; + } CUDA_RESOURCE_DESC; + + * \endcode + * where: + * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. + * CUresourceType is defined as: + * \code + typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, + CU_RESOURCE_TYPE_LINEAR = 0x02, + CU_RESOURCE_TYPE_PITCH2D = 0x03 + } CUresourcetype; + * \endcode + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray + * must be set to a valid CUDA mipmapped array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels + * describe the format of each component and the number of components per array element. 
::CUDA_RESOURCE_DESC::res::pitch2D::width + * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * - ::flags must be set to zero. + * + * + * The ::CUDA_TEXTURE_DESC struct is defined as + * \code + typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; + CUfilter_mode filterMode; + unsigned int flags; + unsigned int maxAnisotropy; + CUfilter_mode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + } CUDA_TEXTURE_DESC; + * \endcode + * where + * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES + * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. + * + * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format would not be promoted, regardless of whether or not this flag is specified. + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is + * the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension; Note + * that for CUDA mipmapped arrays, this flag has to be set. + * + * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be + * clamped to the range [1,16]. + * + * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. + * + * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. + * + * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. + * + * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. 
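/*
 * Illustrative sketch of ::cuTexObjectCreate using the descriptors explained
 * above: a texture object over a linear device buffer. `d_buf` and `num_elems`
 * are assumed to be a valid, texture-aligned CUdeviceptr and its element
 * count; no resource view descriptor is used.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_1d_float_texobject(CUdeviceptr d_buf, size_t num_elems,
                                        CUtexObject *out)
{
    CUDA_RESOURCE_DESC res_desc;
    CUDA_TEXTURE_DESC tex_desc;

    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = CU_RESOURCE_TYPE_LINEAR;
    res_desc.res.linear.devPtr = d_buf;               /* must be texture-aligned */
    res_desc.res.linear.format = CU_AD_FORMAT_FLOAT;
    res_desc.res.linear.numChannels = 1;
    res_desc.res.linear.sizeInBytes = num_elems * sizeof(float);
    res_desc.flags = 0;                               /* must be zero */

    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.filterMode = CU_TR_FILTER_MODE_POINT;    /* addressMode is ignored for linear */

    return cuTexObjectCreate(out, &res_desc, &tex_desc, NULL);
}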
+ * + * + * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as + * \code + typedef struct CUDA_RESOURCE_VIEW_DESC_st + { + CUresourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + } CUDA_RESOURCE_VIEW_DESC; + * \endcode + * where: + * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should + * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block + * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. + * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have + * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base + * format but with 4 channels. + * + * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the + * original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. + * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp + * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, + * then the actual minimum mipmap level clamp will be 3.2. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value + * has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. + * For non-layered resources, this value has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * this value has to be zero. + * + * + * \param pTexObject - Texture object to create + * \param pResDesc - Resource descriptor + * \param pTexDesc - Texture descriptor + * \param pResViewDesc - Resource view descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectDestroy, + * ::cudaCreateTextureObject + */ +CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); + +/** + * \brief Destroys a texture object + * + * Destroys the texture object specified by \p texObject. 
+ * + * \param texObject - Texture object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaDestroyTextureObject + */ +CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); + +/** + * \brief Returns a texture object's resource descriptor + * + * Returns the resource descriptor for the texture object specified by \p texObject. + * + * \param pResDesc - Resource descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceDesc, + */ +CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's texture descriptor + * + * Returns the texture descriptor for the texture object specified by \p texObject. + * + * \param pTexDesc - Texture descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectTextureDesc + */ +CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's resource view descriptor + * + * Returns the resource view descriptor for the texture object specified by \p texObject. + * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pResViewDesc - Resource view descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceViewDesc + */ +CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); + +/** @} */ /* END CUDA_TEXOBJECT */ + +/** + * \defgroup CUDA_SURFOBJECT Surface Object Management + * + * ___MANBRIEF___ surface object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface object management functions of the + * low-level CUDA driver application programming interface. The surface + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a surface object + * + * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes + * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be + * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. + * + * Surface objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a surface object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. 
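/*
 * Illustrative sketch of ::cuSurfObjectCreate: a 2D CUDA array created with
 * the ::CUDA_ARRAY3D_SURFACE_LDST flag and wrapped in a surface object. The
 * width, height and RGBA8 format are arbitrary example choices.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_surface_object(size_t width, size_t height,
                                    CUarray *arr_out, CUsurfObject *surf_out)
{
    CUDA_ARRAY3D_DESCRIPTOR ad;
    CUDA_RESOURCE_DESC rd;
    CUresult err;

    memset(&ad, 0, sizeof(ad));
    ad.Width = width;
    ad.Height = height;
    ad.Depth = 0;                           /* depth 0 -> 2D array */
    ad.Format = CU_AD_FORMAT_UNSIGNED_INT8;
    ad.NumChannels = 4;
    ad.Flags = CUDA_ARRAY3D_SURFACE_LDST;   /* required for surface load/store */

    err = cuArray3DCreate(arr_out, &ad);
    if (err != CUDA_SUCCESS)
        return err;

    memset(&rd, 0, sizeof(rd));
    rd.resType = CU_RESOURCE_TYPE_ARRAY;
    rd.res.array.hArray = *arr_out;
    rd.flags = 0;                           /* must be zero */

    return cuSurfObjectCreate(surf_out, &rd);
}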
+ * + * \param pSurfObject - Surface object to create + * \param pResDesc - Resource descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectDestroy, + * ::cudaCreateSurfaceObject + */ +CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); + +/** + * \brief Destroys a surface object + * + * Destroys the surface object specified by \p surfObject. + * + * \param surfObject - Surface object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaDestroySurfaceObject + */ +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); + +/** + * \brief Returns a surface object's resource descriptor + * + * Returns the resource descriptor for the surface object specified by \p surfObject. + * + * \param pResDesc - Resource descriptor + * \param surfObject - Surface object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaGetSurfaceObjectResourceDesc + */ +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); + +/** @} */ /* END CUDA_SURFOBJECT */ +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access + * + * ___MANBRIEF___ direct peer context memory access functions of the low-level + * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the direct peer context memory access functions + * of the low-level CUDA driver application programming interface. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 4000 + +/** + * \brief Queries if a device may directly access a peer device's memory. + * + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of + * directly accessing memory from contexts on \p peerDev and 0 otherwise. + * If direct access of \p peerDev from \p dev is possible, then access may be + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param dev - Device from which allocations on \p peerDev are to + * be directly accessed. + * \param peerDev - Device on which the allocations to be directly accessed + * by \p dev reside. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceCanAccessPeer + */ +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); + +/** + * \brief Enables direct access to memory allocations in a peer context. + * + * If both the current context and \p peerContext are on devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same + * major compute capability, then on success all allocations from \p peerContext will + * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional + * details. 
+ * + * Note that access granted by this call is unidirectional and that in order to access + * memory from the current context in \p peerContext, a separate symmetric call + * to ::cuCtxEnablePeerAccess() is required. + * + * There is a system-wide maximum of eight peer connections per device. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates + * that the ::CUdevice of the current context cannot directly access memory + * from the ::CUdevice of \p peerContext. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of + * \p peerContext from the current context has already been enabled. + * + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible + * because hardware resources required for peer access have been exhausted. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext + * is not a valid context, or if the current context is \p peerContext. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. + * + * \param peerContext - Peer context to enable direct access to from the current context + * \param Flags - Reserved for future use and must be set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceEnablePeerAccess + */ +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); + +/** + * \brief Disables direct access to memory allocations in a peer context and + * unregisters any registered allocations. + * + Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has + * not yet been enabled from \p peerContext to the current context. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if + * \p peerContext is not a valid context. + * + * \param peerContext - Peer context to disable direct access to + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxEnablePeerAccess, + * ::cudaDeviceDisablePeerAccess + */ +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); + +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 8000 + +/** + * \brief Queries attributes of the link between two devices. + * + * Returns in \p *value the value of the requested attribute \p attrib of the + * link between \p srcDevice and \p dstDevice. The supported attributes are: + * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the + * performance of the link between two devices. + * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. + * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over + * the link are supported. + * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can + * be accessed over the link. + * + * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid + * or if they represent the same device. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is + * a null pointer. 
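/*
 * Illustrative sketch of the peer-access calls documented above, enabling
 * one-directional access from device 0's primary context to device 1's.
 * Assumes cuInit(0) has already been called and at least two devices are
 * present; the primary contexts should eventually be released again with
 * cuDevicePrimaryCtxRelease.
 */
#include <cuda.h>
#include <stdio.h>

static void enable_p2p_from_dev0_to_dev1(void)
{
    CUdevice dev0, dev1;
    CUcontext ctx0, ctx1;
    int can_access = 0, perf_rank = 0;

    cuDeviceGet(&dev0, 0);
    cuDeviceGet(&dev1, 1);

    cuDeviceCanAccessPeer(&can_access, dev0, dev1);
    if (!can_access)
        return;

    cuDevicePrimaryCtxRetain(&ctx0, dev0);
    cuDevicePrimaryCtxRetain(&ctx1, dev1);

    cuCtxSetCurrent(ctx0);
    cuCtxEnablePeerAccess(ctx1, 0);   /* Flags must be 0; grants ctx0 -> ctx1 only */

    cuDeviceGetP2PAttribute(&perf_rank, CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK,
                            dev0, dev1);
    printf("peer link performance rank: %d\n", perf_rank);
}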
+ * + * \param value - Returned value of the requested attribute + * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. + * \param srcDevice - The source device of the target link. + * \param dstDevice - The destination device of the target link. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaDeviceGetP2PAttribute + */ +CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); + +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** @} */ /* END CUDA_PEER_ACCESS */ + +/** + * \defgroup CUDA_GRAPHICS Graphics Interoperability + * + * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graphics interoperability functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Unregisters a graphics resource for access by CUDA + * + * Unregisters the graphics resource \p resource so it is not accessible by + * CUDA unless registered again. + * + * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is + * returned. + * + * \param resource - Resource to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsD3D9RegisterResource, + * ::cuGraphicsD3D10RegisterResource, + * ::cuGraphicsD3D11RegisterResource, + * ::cuGraphicsGLRegisterBuffer, + * ::cuGraphicsGLRegisterImage, + * ::cudaGraphicsUnregisterResource + */ +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); + +/** + * \brief Get an array through which to access a subresource of a mapped graphics resource. + * + * Returns in \p *pArray an array through which the subresource of the mapped + * graphics resource \p resource which corresponds to array index \p arrayIndex + * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may + * change every time that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via an array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p arrayIndex is not a valid array index for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p mipLevel is not a valid mipmap level for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. 
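/*
 * Illustrative sketch of ::cuGraphicsSubResourceGetMappedArray. `res` is
 * assumed to be a texture-backed CUgraphicsResource that was registered
 * earlier (for example with ::cuGraphicsGLRegisterImage).
 */
#include <cuda.h>

static CUresult with_mapped_level0(CUgraphicsResource res, CUstream stream)
{
    CUarray level0;
    CUresult err = cuGraphicsMapResources(1, &res, stream);
    if (err != CUDA_SUCCESS)
        return err;

    /* Array index 0, mipmap level 0; the returned handle may change on every map. */
    err = cuGraphicsSubResourceGetMappedArray(&level0, res, 0, 0);
    if (err == CUDA_SUCCESS) {
        /* While mapped, level0 can be used with cuMemcpy2D/cuMemcpy3D or
         * wrapped in a texture/surface object. */
    }

    return cuGraphicsUnmapResources(1, &res, stream);
}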
+ * + * \param pArray - Returned array through which a subresource of \p resource may be accessed + * \param resource - Mapped resource to access + * \param arrayIndex - Array index for array textures or cubemap face + * index as defined by ::CUarray_cubemap_face for + * cubemap textures for the subresource to access + * \param mipLevel - Mipmap level for the subresource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsSubResourceGetMappedArray + */ +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); + +#if __CUDA_API_VERSION >= 5000 + +/** + * \brief Get a mipmapped array through which to access a mapped graphics resource. + * + * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics + * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time + * that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via a mipmapped array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsResourceGetMappedMipmappedArray + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); + +#endif /* __CUDA_API_VERSION >= 5000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Get a device pointer through which to access a mapped graphics resource. + * + * Returns in \p *pDevPtr a pointer through which the mapped graphics resource + * \p resource may be accessed. + * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. + * The value set in \p pPointer may change every time that \p resource is mapped. + * + * If \p resource is not a buffer then it cannot be accessed via a pointer and + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ * * + * \param pDevPtr - Returned pointer through which \p resource may be accessed + * \param pSize - Returned size of the buffer accessible starting at \p *pPointer + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Set usage flags for mapping a graphics resource + * + * Set \p flags for mapping the graphics resource \p resource. + * + * Changes to \p flags will take effect the next time \p resource is mapped. + * The \p flags argument may be any of the following: + + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA kernels. This is the default value. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which + * access this resource will not write to this resource. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels + * which access this resource will not read from this resource and will + * write over the entire contents of the resource, so none of the data + * previously stored in the resource will be preserved. + * + * If \p resource is presently mapped for access by CUDA then + * ::CUDA_ERROR_ALREADY_MAPPED is returned. + * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param resource - Registered resource to set flags for + * \param flags - Parameters for resource mapping + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsResourceSetMapFlags + */ +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); + +/** + * \brief Map graphics resources for access by CUDA + * + * Maps the \p count graphics resources in \p resources for access by CUDA. + * + * The resources in \p resources may be accessed by CUDA until they + * are unmapped. The graphics API from which \p resources were registered + * should not access any resources while they are mapped by CUDA. If an + * application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any graphics calls + * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA + * work issued in \p stream begins. + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. 
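/*
 * Illustrative sketch of the typical map / use / unmap flow for a buffer-backed
 * resource. `res` is assumed to have been registered earlier (for example with
 * ::cuGraphicsGLRegisterBuffer); the mapped pointer is simply cleared here.
 */
#include <cuda.h>

static CUresult zero_mapped_buffer(CUgraphicsResource res, CUstream stream)
{
    CUdeviceptr dptr;
    size_t nbytes;
    CUresult err;

    err = cuGraphicsMapResources(1, &res, stream);
    if (err != CUDA_SUCCESS)
        return err;

    err = cuGraphicsResourceGetMappedPointer(&dptr, &nbytes, res);
    if (err == CUDA_SUCCESS)
        err = cuMemsetD8Async(dptr, 0, nbytes, stream);   /* use while mapped */

    /* Graphics work issued after the unmap may safely consume the buffer. */
    cuGraphicsUnmapResources(1, &res, stream);
    return err;
}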
+ * + * \param count - Number of resources to map + * \param resources - Resources to map for CUDA usage + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsUnmapResources, + * ::cudaGraphicsMapResources + */ +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** + * \brief Unmap graphics resources. + * + * Unmaps the \p count graphics resources in \p resources. + * + * Once unmapped, the resources in \p resources may not be accessed by CUDA + * until they are mapped again. + * + * This function provides the synchronization guarantee that any CUDA work issued + * in \p stream before ::cuGraphicsUnmapResources() will complete before any + * subsequently issued graphics work begins. + * + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param count - Number of resources to unmap + * \param resources - Resources to unmap + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsUnmapResources + */ +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** @} */ /* END CUDA_GRAPHICS */ + +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); + + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuMemHostRegister + #undef cuGraphicsResourceSetMapFlags + #undef cuLinkCreate + #undef cuLinkAddData + #undef cuLinkAddFile + #undef cuDeviceTotalMem + #undef cuCtxCreate + #undef cuModuleGetGlobal + #undef cuMemGetInfo + #undef cuMemAlloc + #undef cuMemAllocPitch + #undef cuMemFree + #undef cuMemGetAddressRange + #undef cuMemAllocHost + #undef cuMemHostGetDevicePointer + #undef cuMemcpyHtoD + #undef cuMemcpyDtoH + #undef cuMemcpyDtoD + #undef cuMemcpyDtoA + #undef cuMemcpyAtoD + #undef cuMemcpyHtoA + #undef cuMemcpyAtoH + #undef cuMemcpyAtoA + #undef cuMemcpyHtoAAsync + #undef cuMemcpyAtoHAsync + #undef cuMemcpy2D + #undef cuMemcpy2DUnaligned + #undef cuMemcpy3D + #undef cuMemcpyHtoDAsync + #undef cuMemcpyDtoHAsync + #undef cuMemcpyDtoDAsync + #undef cuMemcpy2DAsync + #undef cuMemcpy3DAsync + #undef cuMemsetD8 + #undef cuMemsetD16 + #undef cuMemsetD32 + #undef cuMemsetD2D8 + #undef cuMemsetD2D16 + #undef cuMemsetD2D32 + #undef cuArrayCreate + #undef cuArrayGetDescriptor + #undef cuArray3DCreate + #undef cuArray3DGetDescriptor + #undef cuTexRefSetAddress + #undef cuTexRefSetAddress2D + #undef cuTexRefGetAddress + #undef cuGraphicsResourceGetMappedPointer + #undef cuCtxDestroy + #undef cuCtxPopCurrent + #undef cuCtxPushCurrent + #undef cuStreamDestroy + #undef cuEventDestroy + #undef cuMemcpy + #undef cuMemcpyAsync + #undef cuMemcpyPeer + #undef 
cuMemcpyPeerAsync + #undef cuMemcpy3DPeer + #undef cuMemcpy3DPeerAsync + #undef cuMemsetD8Async + #undef cuMemsetD16Async + #undef cuMemsetD32Async + #undef cuMemsetD2D8Async + #undef cuMemsetD2D16Async + #undef cuMemsetD2D32Async + #undef cuStreamGetPriority + #undef cuStreamGetFlags + #undef cuStreamGetCtx + #undef cuStreamWaitEvent + #undef cuStreamAddCallback + #undef cuStreamAttachMemAsync + #undef cuStreamQuery + #undef cuStreamSynchronize + #undef cuEventRecord + #undef cuLaunchKernel + #undef cuLaunchHostFunc + #undef cuGraphicsMapResources + #undef cuGraphicsUnmapResources + #undef cuStreamWriteValue32 + #undef cuStreamWaitValue32 + #undef cuStreamWriteValue64 + #undef cuStreamWaitValue64 + #undef cuStreamBatchMemOp + #undef cuMemPrefetchAsync + #undef cuLaunchCooperativeKernel + #undef cuSignalExternalSemaphoresAsync + #undef cuWaitExternalSemaphoresAsync + #undef cuStreamBeginCapture + #undef cuStreamEndCapture + #undef cuStreamIsCapturing + #undef cuStreamGetCaptureInfo + #undef cuGraphLaunch +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); +#endif /* defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); +#endif /* defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) +CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); +CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); +CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); +#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) +CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); +#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */ + +/** + * CUDA API made obselete at API version 3020 + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #define CUdeviceptr CUdeviceptr_v1 + #define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st + #define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1 + #define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st + #define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1 + #define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st + #define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1 + #define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st + #define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1 +#endif /* CUDA_FORCE_LEGACY32_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 + +typedef unsigned int CUdeviceptr; + +typedef struct CUDA_MEMCPY2D_st +{ + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ 
+ const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ + unsigned int Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D; + +typedef struct CUDA_MEMCPY3D_st +{ + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + unsigned int srcZ; /**< Source Z */ + unsigned int srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + unsigned int dstZ; /**< Destination Z */ + unsigned int dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ + unsigned int Height; /**< Height of 3D memory copy */ + unsigned int Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D; + +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of array */ + unsigned int Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR; + +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of 3D array */ + unsigned int Height; /**< Height of 3D array */ + unsigned int Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR; + +CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); +CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize); +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); +CUresult 
CUDAAPI cuMemFree(CUdeviceptr dptr); +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr); +CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N); +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N); +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N); +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes); +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch); +CUresult CUDAAPI 
cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000 +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef CUdeviceptr + #undef CUDA_MEMCPY2D_st + #undef CUDA_MEMCPY2D + #undef CUDA_MEMCPY3D_st + #undef CUDA_MEMCPY3D + #undef CUDA_ARRAY_DESCRIPTOR_st + #undef CUDA_ARRAY_DESCRIPTOR + #undef CUDA_ARRAY3D_DESCRIPTOR_st + #undef CUDA_ARRAY3D_DESCRIPTOR +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) + CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); + CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); + CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); + CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, 
CUdeviceptr src, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + + CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + + CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + CUresult CUDAAPI cuStreamQuery(CUstream hStream); + CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int 
blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); + CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); + CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus, cuuint64_t *id); + CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); +#endif + +#ifdef __cplusplus +} +#endif + +#undef __CUDA_API_VERSION +#undef __CUDA_DEPRECATED + +#endif /* __cuda_cuda_h__ */ diff --git a/icicle/curves/bn254/cuda_runtime.h b/icicle/curves/bn254/cuda_runtime.h new file mode 100644 index 000000000..b909b4dee --- /dev/null +++ b/icicle/curves/bn254/cuda_runtime.h @@ -0,0 +1,2039 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 
227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_RUNTIME_H__) +#define __CUDA_RUNTIME_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic push +#endif +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4820) +#endif +#endif + +#ifdef __QNX__ +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) +typedef unsigned size_t; +#endif +#endif +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "crt/host_config.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "builtin_types.h" +#include "library_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#include "cuda_runtime_api.h" +#include "driver_functions.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "crt/host_defines.h" +#include "vector_functions.h" + +#if defined(__CUDACC__) + +#if defined(__CUDACC_RTC__) +#include "nvrtc_device_runtime.h" +#include "crt/device_functions.h" +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "device_launch_parameters.h" + +#else /* !__CUDACC_RTC__ */ +#define EXCLUDE_FROM_RTC +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "crt/device_functions.h" +#include "device_launch_parameters.h" + +#if defined(__CUDACC_EXTENDED_LAMBDA__) +#include +#include +struct __device_builtin__ __nv_lambda_preheader_injection { }; +#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */ + +#undef EXCLUDE_FROM_RTC +#endif /* __CUDACC_RTC__ */ + +#endif /* __CUDACC__ */ + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) && !defined(__CUDACC_RTC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * @{ + */ + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. 
Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from args[0] to args[N - 1], point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. + * + * \param func - Device function symbol + * \param gridDim - Grid dimensions + * \param blockDim - Block dimensions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaLaunchKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::cudaDevAttrCooperativeLaunch. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from args[0] to args[N - 1], point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. 
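+ *
+ * As a rough usage sketch only (the kernel \p coopKernel, its argument and the
+ * launch dimensions are placeholders, not part of this header), a launch through
+ * this C++ wrapper could look like:
+ * \code
+ * __global__ void coopKernel(int *data);
+ *
+ * int *d_data;                        // assumed to be allocated with cudaMalloc elsewhere
+ * void *args[] = { &d_data };
+ * dim3 grid(4), block(256);
+ * cudaError_t err = cudaLaunchCooperativeKernel(coopKernel, grid, block, args);
+ * \endcode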
+ * + * \param func - Device function symbol + * \param gridDim - Grid dimensions + * \param blockDim - Block dimensions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + +/** + * \brief \hl Creates an event object with the specified flags + * + * Creates an event object with the specified flags. Valid flags include: + * - ::cudaEventDefault: Default event creation flag. + * - ::cudaEventBlockingSync: Specifies that event should use blocking + * synchronization. A host thread that uses ::cudaEventSynchronize() to wait + * on an event created with this flag will block until the event actually + * completes. + * - ::cudaEventDisableTiming: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::cudaEventBlockingSync flag not specified will provide the best + * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery(). + * + * \param event - Newly created event + * \param flags - Flags for new event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent + */ +static __inline__ __host__ cudaError_t cudaEventCreate( + cudaEvent_t *event, + unsigned int flags +) +{ + return ::cudaEventCreateWithFlags(event, flags); +} + +/** + * \brief \hl Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it + * can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between host + * and device. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaHostAllocDefault: This flag's value is defined to be 0. 
+ * - ::cudaHostAllocPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. + * The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). + * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). + * WC memory can be transferred across the PCI Express bus more quickly on some + * system configurations, but cannot be read efficiently by most CPUs. WC + * memory is a good option for buffers that will be written by the CPU and read + * by the device via mapped pinned memory or host->device transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost + * flag in order for the ::cudaHostAllocMapped flag to have any effect. + * + * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices + * that do not support mapped pinned memory. The failure is deferred to + * ::cudaHostGetDevicePointer() because the memory may be mapped into other + * CUDA contexts via the ::cudaHostAllocPortable flag. + * + * Memory allocated by this function must be freed with ::cudaFreeHost(). + * + * \param ptr - Device pointer to allocated memory + * \param size - Requested allocation size in bytes + * \param flags - Requested properties of allocated memory + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaSetDeviceFlags, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc + */ +static __inline__ __host__ cudaError_t cudaMallocHost( + void **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc(ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaHostAlloc( + T **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc((void**)(void*)ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaHostGetDevicePointer( + T **pDevice, + void *pHost, + unsigned int flags +) +{ + return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags); +} + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p size bytes of managed memory on the device and returns in + * \p *devPtr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::cudaErrorNotSupported is returned. Support + * for managed memory can be queried using the device attribute + * ::cudaDevAttrManagedMemory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p size + * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The + * default value for \p flags is ::cudaMemAttachGlobal. + * If ::cudaMemAttachGlobal is specified, then this memory is accessible from + * any stream on any device. 
If ::cudaMemAttachHost is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to + * ::cudaStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cudaStreamAttachMemAsync to + * a single stream, the default association, as specified during ::cudaMallocManaged, + * is restored when that stream is destroyed. For __managed__ variables, the + * default association is always ::cudaMemAttachGlobal. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cudaMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. 
This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero + * value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all devices used in + * that process that support managed memory have to be peer-to-peer compatible + * with each other. The error ::cudaErrorInvalidDevice will be returned if a device + * that supports managed memory is used and it is not peer-to-peer compatible with + * any of the other managed memory supporting devices that were previously used in + * that process, even if ::cudaDeviceReset has been called on those devices. These + * environment variables are described in the CUDA programming guide under the + * "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. + * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync + */ +template +static __inline__ __host__ cudaError_t cudaMallocManaged( + T **devPtr, + size_t size, + unsigned int flags = cudaMemAttachGlobal +) +{ + return ::cudaMallocManaged((void**)(void*)devPtr, size, flags); +} + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p stream to specify stream association of + * \p length bytes of memory starting from \p devPtr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p devPtr must point to an one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cudaMallocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::cudaDevAttrPageableMemoryAccess. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle. 
+ * The default value for \p flags is ::cudaMemAttachSingle + * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with + * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, + * the program makes a guarantee that it will only access the memory on the device + * from \p stream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p stream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p stream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cudaMallocManaged. For __managed__ variables, the default + * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. 
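+ *
+ * A minimal sketch (the allocation size and the stream are illustrative
+ * placeholders), attaching a managed allocation to a single stream:
+ * \code
+ * float *data;
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaMallocManaged(&data, 1024 * sizeof(float));
+ * cudaStreamAttachMemAsync(stream, data);   // length 0, flags default to cudaMemAttachSingle
+ * cudaStreamSynchronize(stream);            // after this, the CPU may access data
+ * \endcode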
+ * + * \param stream - Stream in which to enqueue the attach operation + * \param devPtr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * memory) + * \param length - Length of memory (defaults to zero) + * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged + */ +template +static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync( + cudaStream_t stream, + T *devPtr, + size_t length = 0, + unsigned int flags = cudaMemAttachSingle +) +{ + return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags); +} + +template +static __inline__ __host__ cudaError_t cudaMalloc( + T **devPtr, + size_t size +) +{ + return ::cudaMalloc((void**)(void*)devPtr, size); +} + +template +static __inline__ __host__ cudaError_t cudaMallocHost( + T **ptr, + size_t size, + unsigned int flags = 0 +) +{ + return cudaMallocHost((void**)(void*)ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaMallocPitch( + T **devPtr, + size_t *pitch, + size_t width, + size_t height +) +{ + return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height); +} + +#if defined(__CUDACC__) + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. + * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice +) +{ + return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. 
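+ *
+ * A minimal sketch (the __constant__ symbol \p coeffs and the stream are
+ * illustrative placeholders):
+ * \code
+ * __constant__ float coeffs[16];
+ *
+ * float h_coeffs[16] = { 0 };
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaMemcpyToSymbolAsync(coeffs, h_coeffs, sizeof(h_coeffs), 0,
+ *                         cudaMemcpyHostToDevice, stream);
+ * \endcode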
+ * + * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy + * may overlap with operations in other streams. + * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. + * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost +) +{ + return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that resides in + * global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. 
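+ *
+ * A minimal sketch in the opposite direction (the __device__ symbol \p results
+ * and the stream are illustrative placeholders):
+ * \code
+ * __device__ float results[16];
+ *
+ * float h_results[16];
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaMemcpyFromSymbolAsync(h_results, results, sizeof(h_results), 0,
+ *                           cudaMemcpyDeviceToHost, stream);
+ * cudaStreamSynchronize(stream);
+ * \endcode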
+ * + * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap + * with operations in other streams. + * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream); +} + +/** + * \brief \hl Finds the address associated with a CUDA symbol + * + * Returns in \p *devPtr the address of symbol \p symbol on the device. + * \p symbol can either be a variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in the global or constant memory space, \p *devPtr is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param devPtr - Return device pointer associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", + * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolAddress( + void **devPtr, + const T &symbol +) +{ + return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol); +} + +/** + * \brief \hl Finds the size of the object associated with a CUDA symbol + * + * Returns in \p *size the size of symbol \p symbol. \p symbol must be a + * variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in global or constant memory space, \p *size is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. 
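+ *
+ * A minimal sketch (the __device__ symbol \p counter is an illustrative
+ * placeholder) combining this call with ::cudaGetSymbolAddress:
+ * \code
+ * __device__ int counter;
+ *
+ * void  *d_ptr = NULL;
+ * size_t sz    = 0;
+ * cudaGetSymbolAddress(&d_ptr, counter);
+ * cudaGetSymbolSize(&sz, counter);   // sz becomes sizeof(int)
+ * \endcode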
+ * + * \param size - Size of object associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", + * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolSize( + size_t *size, + const T &symbol +) +{ + return ::cudaGetSymbolSize(size, (const void*)&symbol); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. \p desc describes how the memory is interpreted when + * fetching values from the texture. The \p offset parameter is an optional + * byte offset as with the low-level + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" + * function. Any memory previously bound to \p tex is unbound. + * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param desc - Channel format + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t size = UINT_MAX +) +{ + return ::cudaBindTexture(offset, &tex, devPtr, &desc, size); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. The channel descriptor is inherited from the texture + * reference type. The \p offset parameter is an optional byte offset as with + * the low-level + * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) + * function. Any memory previously bound to \p tex is unbound. 
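+ *
+ * A minimal sketch (the texture reference \p texRef and the buffer size are
+ * illustrative placeholders; texture references are a legacy API):
+ * \code
+ * texture<float, 1, cudaReadModeElementType> texRef;
+ *
+ * float *d_buf;
+ * size_t offset = 0;
+ * cudaMalloc(&d_buf, 256 * sizeof(float));
+ * cudaBindTexture(&offset, texRef, d_buf, 256 * sizeof(float));
+ * \endcode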
+ * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture &tex, + const void *devPtr, + size_t size = UINT_MAX +) +{ + return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. \p desc describes how the memory is interpreted when fetching values + * from the texture. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
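+ *
+ * A minimal sketch (the texture reference \p texRef2D and the image dimensions
+ * are illustrative placeholders):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef2D;
+ *
+ * size_t width = 64, height = 64, pitch = 0, offset = 0;
+ * float *d_img;
+ * cudaMallocPitch(&d_img, &pitch, width * sizeof(float), height);
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaBindTexture2D(&offset, texRef2D, d_img, desc, width, height, pitch);
+ * \endcode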
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param desc - Channel format + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. The channel descriptor is inherited from the texture reference + * type. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA array previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToArray(&tex, array, &desc); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p tex is unbound. 
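+ *
+ * A minimal sketch (the texture reference \p texRef and the array dimensions
+ * are illustrative placeholders):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef;
+ *
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaArray_t arr;
+ * cudaMallocArray(&arr, &desc, 64, 64);
+ * cudaBindTextureToArray(texRef, arr);   // channel descriptor taken from arr
+ * \endcode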
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err; +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture &tex, + cudaMipmappedArray_const_t mipmappedArray, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc); +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array + * previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture &tex, + cudaMipmappedArray_const_t mipmappedArray +) +{ + struct cudaChannelFormatDesc desc; + cudaArray_t levelArray; + cudaError_t err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0); + + if (err != cudaSuccess) { + return err; + } + err = ::cudaGetChannelDesc(&desc, levelArray); + + return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err; +} + +/** + * \brief \hl Unbinds a texture + * + * Unbinds the texture bound to \p tex. If \p texref is not currently bound, no operation is performed. 
+ * + * \param tex - Texture to unbind + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaUnbindTexture( + const struct texture &tex +) +{ + return ::cudaUnbindTexture(&tex); +} + +/** + * \brief \hl Get the alignment offset of a texture + * + * Returns in \p *offset the offset that was returned when texture reference + * \p tex was bound. + * + * \param offset - Offset of texture reference in bytes + * \param tex - Texture to get offset of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( + size_t *offset, + const struct texture &tex +) +{ + return ::cudaGetTextureAlignmentOffset(offset, &tex); +} + +/** + * \brief \hl Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the 
same hardware + * resources, this sets through \p cacheConfig the preferred cache configuration + * for the function specified via \p func. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute \p func. + * + * \p func must be a pointer to a function that executes on the device. + * The parameter specified by \p func must be declared as a \p __global__ + * function. If the specified function does not exist, + * then ::cudaErrorInvalidDeviceFunction is returned. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param func - device function pointer + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * ::cudaThreadGetCacheConfig, + * ::cudaThreadSetCacheConfig + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig( + T *func, + enum cudaFuncCache cacheConfig +) +{ + return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig); +} + +template +static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig( + T *func, + enum cudaSharedMemConfig config +) +{ + return ::cudaFuncSetSharedMemConfig((const void*)func, config); +} + +/** + * \brief Returns occupancy for a device function + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. 
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault); +} + +/** + * \brief Returns occupancy for a device function with the specified flags + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * + * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. 
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags); +} + +/** + * Helper functor for cudaOccupancyMaxPotentialBlockSize + */ +class __cudaOccupancyB2DHelper { + size_t n; +public: + inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {} + inline __host__ CUDART_DEVICE size_t operator()(int) + { + return n; + } +}; + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + cudaError_t status; + + // Device and function properties + int device; + struct cudaFuncAttributes attr; + + // Limits + int maxThreadsPerMultiProcessor; + int warpSize; + int devMaxThreadsPerBlock; + int multiProcessorCount; + int funcMaxThreadsPerBlock; + int occupancyLimit; + int granularity; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + size_t dynamicSMemSize; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || !blockSize || !func) { + return cudaErrorInvalidValue; + } + + ////////////////////////////////////////////// + // Obtain device and function properties + ////////////////////////////////////////////// + + status = ::cudaGetDevice(&device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &maxThreadsPerMultiProcessor, + cudaDevAttrMaxThreadsPerMultiProcessor, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &warpSize, + cudaDevAttrWarpSize, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &devMaxThreadsPerBlock, + cudaDevAttrMaxThreadsPerBlock, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &multiProcessorCount, + cudaDevAttrMultiProcessorCount, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaFuncGetAttributes(&attr, func); + if (status != cudaSuccess) { + return status; + } + + funcMaxThreadsPerBlock = attr.maxThreadsPerBlock; + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = maxThreadsPerMultiProcessor; + granularity = warpSize; + + if (blockSizeLimit == 0) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (devMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (funcMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = funcMaxThreadsPerBlock; + } + + blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity; + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + // This is needed for the first iteration, because + // blockSizeLimitAligned could be greater than blockSizeLimit + // + if (blockSizeLimit < 
blockSizeToTryAligned) { + blockSizeToTry = blockSizeLimit; + } else { + blockSizeToTry = blockSizeToTryAligned; + } + + dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); + + status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &occupancyInBlocks, + func, + blockSizeToTry, + dynamicSMemSize, + flags); + + if (status != cudaSuccess) { + return status; + } + + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * multiProcessorCount; + *blockSize = maxBlockSize; + + return status; +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault); +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault); +} + +/** + * \brief Returns grid and block size that achieved maximum potential occupancy for a device function with the specified flags + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * The \p flags parameter controls how special cases are handle. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSize + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags); +} + +/** + * \brief \hl Find out attributes for a given function + * + * This function obtains the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The fetched attributes are placed in \p attr. If the specified + * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * + * Note that some function attributes such as + * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" + * may vary based on the device that is currently being used. + * + * \param attr - Return pointer to function's attributes + * \param entry - Function to get attributes of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template +static __inline__ __host__ cudaError_t cudaFuncGetAttributes( + struct cudaFuncAttributes *attr, + T *entry +) +{ + return ::cudaFuncGetAttributes(attr, (const void*)entry); +} + +/** + * \brief \hl Set attributes for a given function + * + * This function sets the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The enumeration defined by \p attr is set to the value defined by \p value. + * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * If the specified attribute cannot be written, or if the value is incorrect, + * then ::cudaErrorInvalidValue is returned. + * + * Valid values for \p attr are: + * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes + * cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture. 
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * + * \param entry - Function to get attributes of + * \param attr - Attribute to set + * \param value - Value to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetAttribute( + T *entry, + enum cudaFuncAttribute attr, + int value +) +{ + return ::cudaFuncSetAttribute((const void*)entry, attr, value); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * \p desc describes how the memory is interpreted when dealing with + * the surface. Any CUDA array previously bound to \p surf is unbound. + * + * \param surf - Surface to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindSurfaceToArray(&surf, array, &desc); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p surf is unbound. + * + * \param surf - Surface to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? 
cudaBindSurfaceToArray(surf, array, desc) : err; +} + +#endif /* __CUDACC__ */ + +/** @} */ /* END CUDART_HIGHLEVEL */ + +#endif /* __cplusplus && !__CUDACC_RTC__ */ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic pop +#endif +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif +#endif + +#undef __CUDA_DEPRECATED + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#endif /* !__CUDA_RUNTIME_H__ */ diff --git a/icicle/curves/bn254/curve_config.cuh b/icicle/curves/bn254/curve_config.cuh index f1e26172a..adc8729f4 100644 --- a/icicle/curves/bn254/curve_config.cuh +++ b/icicle/curves/bn254/curve_config.cuh @@ -2,6 +2,9 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" +#if defined(G2_DEFINED) +#include "../../primitives/extension_field.cuh" +#endif #include "params.cuh" diff --git a/icicle/curves/bn254/lde.cu b/icicle/curves/bn254/lde.cu index 302be7139..da76e69b6 100644 --- a/icicle/curves/bn254/lde.cu +++ b/icicle/curves/bn254/lde.cu @@ -5,6 +5,9 @@ #include "../../appUtils/ntt/ntt.cuh" #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh" #include "curve_config.cuh" +#include "../../utils/mont.cuh" + + extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { @@ -24,7 +27,7 @@ extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32 } } -extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -39,7 +42,7 @@ extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, si } } -extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -85,7 +88,8 @@ extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::sca { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BN254::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,8 +103,37 @@ extern "C" int interpolate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN25 { try { + BN254::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int interpolate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, BN254::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) +{ + try + { + return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream); + } + catch (const std::runtime_error &ex) + 
{ + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int interpolate_scalars_batch_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_evaluations, BN254::scalar_t* d_domain, unsigned n, + unsigned batch_size, BN254::scalar_t* coset_powers, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -113,7 +146,8 @@ extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254:: { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BN254::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -127,8 +161,9 @@ extern "C" int interpolate_points_batch_cuda_bn254(BN254::projective_t* d_out, B { try { + BN254::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -266,8 +301,10 @@ extern "C" int ntt_inplace_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::sca { try { + cudaStreamCreate(&stream); - ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, stream, true); + BN254::scalar_t* _null = nullptr; + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true); return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ } catch (const std::runtime_error &ex) @@ -277,6 +314,192 @@ extern "C" int ntt_inplace_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::sca } } +extern "C" int ntt_inplace_coset_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::scalar_t* d_twiddles, + unsigned n, unsigned batch_size, bool inverse, bool is_coset, BN254::scalar_t* coset, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, is_coset, coset, stream, true); + return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int sub_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_in1, BN254::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return sub_polys(d_out, d_in1, d_in2, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int add_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_in1, BN254::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return add_polys(d_out, d_in1, d_in2, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_scalars_cuda_bn254(BN254::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery(d_inout, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_scalars_cuda_bn254(BN254::scalar_t* d_inout, unsigned n, cudaStream_t stream = 
0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery(d_inout, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_proj_points_cuda_bn254(BN254::projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 3 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_proj_points_cuda_bn254(BN254::projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 3 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_aff_points_cuda_bn254(BN254::affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 2 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_aff_points_cuda_bn254(BN254::affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 2 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +#if defined(G2_DEFINED) +extern "C" int to_montgomery_proj_points_g2_cuda_bn254(BN254::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 6 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_proj_points_g2_cuda_bn254(BN254::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 6 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_aff_points_g2_cuda_bn254(BN254::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 4 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_aff_points_g2_cuda_bn254(BN254::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 4 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} +#endif + extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try @@ -284,6 +507,7 @@ extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, siz uint32_t logn = uint32_t(log(n) / log(2)); cudaStreamCreate(&stream); reverse_order(arr, n, logn, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bn254/msm.cu b/icicle/curves/bn254/msm.cu index 8de1c4bb3..3133b7b7d 100644 --- a/icicle/curves/bn254/msm.cu +++ b/icicle/curves/bn254/msm.cu @@ -11,8 +11,10 @@ int 
msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[], BN254::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try - { - large_msm(scalars, points, count, out, false, stream); + { + cudaStreamCreate(&stream); + large_msm(scalars, points, count, out, false, false, stream); + cudaStreamSynchronize(stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -25,18 +27,18 @@ int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[], extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t points[], BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) { - try - { - cudaStreamCreate(&stream); - batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); - cudaStreamSynchronize(stream); - return CUDA_SUCCESS; - } - catch (const std::runtime_error &ex) - { - printf("error %s", ex.what()); - return -1; - } + try + { + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } } /** @@ -47,46 +49,138 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po * @param d_points Points for the MSM. Must be on device. * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). */ - extern "C" - int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) - { - try - { - large_msm(d_scalars, d_points, count, d_out, true, stream); - cudaStreamSynchronize(stream); - return 0; - } - catch (const std::runtime_error &ex) - { - printf("error %s", ex.what()); - return -1; - } - } +extern "C" +int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +/** + * Commit to a batch of polynomials using the MSM. + * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points. + * @param d_out Ouptut point to write the results to. + * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device. + * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM. + * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`. + * @param batch_size Size of the batch. 
+ */ +extern "C" +int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +#if defined(G2_DEFINED) +extern "C" +int msm_g2_cuda_bn254(BN254::g2_projective_t *out, BN254::g2_affine_t points[], + BN254::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + large_msm(scalars, points, count, out, false, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int msm_batch_g2_cuda_bn254(BN254::g2_projective_t* out, BN254::g2_affine_t points[], + BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +/** + * Commit to a polynomial using the MSM in G2 group. + * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points. + * @param d_out Ouptut G2 point to write the result to. + * @param d_scalars Scalars for the MSM. Must be on device. + * @param d_points G2 affine points for the MSM. Must be on device. + * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). + */ +extern "C" +int commit_g2_cuda_bn254(BN254::g2_projective_t* d_out, BN254::scalar_t* d_scalars, BN254::g2_affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) +{ + // TODO: use device_id when working with multiple devices + (void)device_id; + try + { + cudaStreamCreate(&stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} /** * Commit to a batch of polynomials using the MSM. * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points. - * @param d_out Ouptut point to write the results to. + * @param d_out Ouptut G2 point to write the results to. * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device. - * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM. + * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM. * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`. * @param batch_size Size of the batch. 
*/ - extern "C" - int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) - { - try - { +extern "C" +int commit_batch_g2_cuda_bn254(BN254::g2_projective_t* d_out, BN254::scalar_t* d_scalars, BN254::g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) +{ + // TODO: use device_id when working with multiple devices + (void)device_id; + try + { cudaStreamCreate(&stream); - batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); - cudaStreamSynchronize(stream); - return 0; - } - catch (const std::runtime_error &ex) - { - printf("error %s", ex.what()); - return -1; - } - } - - #endif + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} +#endif +#endif diff --git a/icicle/curves/bn254/msm.h b/icicle/curves/bn254/msm.h new file mode 100644 index 000000000..a525ca583 --- /dev/null +++ b/icicle/curves/bn254/msm.h @@ -0,0 +1,62 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
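A minimal host-side sketch (not part of this patch) of how the commit entry point added in msm.cu above, and declared in this header below, might be driven from a .cu caller. It assumes d_scalars and d_points already hold the MSM inputs in device memory, and it redeclares the extern "C" symbol without its default arguments; the helper name commit_example is illustrative only.

#include <cuda_runtime.h>
#include "curve_config.cuh"

extern "C" int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars,
                                 BN254::affine_t* d_points, size_t count,
                                 size_t device_id, cudaStream_t stream);

// Commit to one polynomial of `count` coefficients and copy the resulting point back to the host.
int commit_example(BN254::scalar_t* d_scalars, BN254::affine_t* d_points,
                   size_t count, BN254::projective_t* h_result)
{
  BN254::projective_t* d_out;
  cudaMalloc(&d_out, sizeof(BN254::projective_t));
  int ret = commit_cuda_bn254(d_out, d_scalars, d_points, count, /*device_id=*/0, /*stream=*/0);
  cudaMemcpy(h_result, d_out, sizeof(BN254::projective_t), cudaMemcpyDeviceToHost);
  cudaFree(d_out);
  return ret;
}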
+ +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +#include +// msm.h + +#ifndef _BN254_MSM_H +#define _BN254_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BN254 projective and affine structs +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_g2_projective_t BN254_g2_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_g2_affine_t BN254_g2_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; +typedef cudaStream_t CudaStream_t; + +int msm_cuda_bn254(BN254_projective_t* out, BN254_affine_t* points, + BN254_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_bn254(BN254_projective_t* out, BN254_affine_t* points, + BN254_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_bn254(BN254_projective_t* d_out, BN254_scalar_t* d_scalars, + BN254_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_bn254(BN254_projective_t* d_out, BN254_scalar_t* d_scalars, + BN254_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +int msm_g2_cuda_bn254(BN254_g2_projective_t *out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id); +int msm_batch_g2_cuda_bn254(BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t batch_size, size_t msm_size, size_t device_id); +int commit_g2_cuda_bn254(BN254_g2_projective_t* d_out, BN254_scalar_t* d_scalars, BN254_g2_affine_t* d_points, size_t count, size_t device_id); +int commit_batch_g2_cuda_bn254(BN254_g2_projective_t* d_out, BN254_scalar_t* d_scalars, BN254_g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id, cudaStream_t stream); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_MSM_H */ diff --git a/icicle/curves/bn254/ntt.h b/icicle/curves/bn254/ntt.h new file mode 100644 index 000000000..1841fb814 --- /dev/null +++ b/icicle/curves/bn254/ntt.h @@ -0,0 +1,68 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
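A short sketch (not part of this patch) of the coset-interpolation path that lde.cu adds above and this header declares below. It assumes d_evals and d_coset_powers are device buffers of length n, and that the domain is built with inverse twiddles, as in the existing non-coset interpolate wrappers; the helper name interpolate_on_coset_example is illustrative only.

#include <cuda_runtime.h>
#include <math.h>
#include "curve_config.cuh"

extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn,
                                                    bool inverse, size_t device_id, cudaStream_t stream);
extern "C" int interpolate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_evaluations,
                                                       BN254::scalar_t* d_domain, unsigned n,
                                                       BN254::scalar_t* coset_powers,
                                                       unsigned device_id, cudaStream_t stream);

// Recover coefficients from `n` evaluations taken over a coset and write them to d_coeffs.
int interpolate_on_coset_example(BN254::scalar_t* d_evals, BN254::scalar_t* d_coset_powers,
                                 unsigned n, BN254::scalar_t* d_coeffs, cudaStream_t stream)
{
  uint32_t logn = (uint32_t)log2((double)n);
  // Inverse twiddle factors for the interpolation direction.
  BN254::scalar_t* d_domain = build_domain_cuda_bn254(n, logn, /*inverse=*/true, /*device_id=*/0, stream);
  return interpolate_scalars_on_coset_cuda_bn254(d_coeffs, d_evals, d_domain, n,
                                                 d_coset_powers, /*device_id=*/0, stream);
}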
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BN254_NTT_H +#define _BN254_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BN254 projective and affine structs +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +int ntt_cuda_bn254(BN254_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_bn254(BN254_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bn254(BN254_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bn254(BN254_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + + +BN254_scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream); +int interpolate_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_evaluations, BN254_scalar_t *d_domain, unsigned n, unsigned device_id, size_t stream); +int interpolate_scalars_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int interpolate_points_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_evaluations, BN254_scalar_t *d_domain, unsigned n, size_t device_id, size_t stream); +int interpolate_points_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_evaluations, BN254_scalar_t* d_domain,unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int interpolate_scalars_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, BN254_scalar_t* coset_powers, size_t device_id, size_t stream); +int interpolate_scalars_batch_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, unsigned batch_size, BN254_scalar_t* coset_powers, size_t device_id, size_t stream); +int evaluate_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size, unsigned n, unsigned device_id, size_t stream); +int evaluate_scalars_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int evaluate_points_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size, unsigned n, size_t device_id, size_t stream); +int evaluate_points_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int evaluate_scalars_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size,unsigned n, BN254_scalar_t *coset_powers, unsigned device_id, size_t stream); +int evaluate_scalars_on_coset_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BN254_scalar_t *coset_powers, size_t device_id, size_t stream); +int evaluate_points_on_coset_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size,unsigned n, BN254_scalar_t 
*coset_powers, size_t device_id, size_t stream); +int evaluate_points_on_coset_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BN254_scalar_t *coset_powers, size_t device_id, size_t stream); +int reverse_order_scalars_cuda_bn254(BN254_scalar_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_scalars_batch_cuda_bn254(BN254_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int reverse_order_points_cuda_bn254(BN254_projective_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_points_batch_cuda_bn254(BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int add_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int sub_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int to_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); +int from_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_NTT_H */ diff --git a/icicle/curves/bn254/params.cuh b/icicle/curves/bn254/params.cuh index d4e4fda8b..5a8d184d2 100644 --- a/icicle/curves/bn254/params.cuh +++ b/icicle/curves/bn254/params.cuh @@ -18,68 +18,70 @@ namespace PARAMS_BN254 { static constexpr storage m = {0xbe1de925, 0x620703a6, 0x09e880ae, 0x71448520, 0x68073014, 0xab074a58, 0x623a04a7, 0x54a47462}; static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695, 0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9, 0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951}; static constexpr storage_array omega = { { - {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, - {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000}, - {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711}, - {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf}, - {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136}, - {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2}, - {0x29bbd82a, 0x66025d34, 0xd51adad3, 0x7de451fd, 0x2391cd58, 0x75d44157, 0x67c7e8f7, 0x1a228e1f}, - {0xe58d2045, 0x9e8224f9, 0x9db56d8b, 0x8763970f, 0x6924235d, 0x002c22ca, 0x9a5b1fe5, 0x23f7a8c4}, - {0x57e32226, 0xecb0115c, 0x7986b170, 0x9de32043, 0x9ba3478e, 0xbda33f36, 0x42663c00, 0x2a98c60f}, - {0x893de19c, 0x1e7cf96f, 0x41b8ab52, 0xeee3e28d, 0xd0b69f2c, 0x7d9ef422, 0x3fb50a52, 0x213a41b9}, - {0x0984c448, 0xf08c8b53, 0xc402c42c, 0xb129e235, 0x9cd953ee, 0x06981b97, 0x54c83f3a, 0x14c28c45}, - {0x7f2bce0e, 0x637162dd, 0x60632cfd, 0x3986de3a, 0x322a13d5, 0x1d597f9b, 0x443a15cd, 0x2288f608}, - {0x4feaa40d, 0x6e4249aa, 0x55bea19d, 0xe320bcd2, 0x8a080b27, 0x46ecf54e, 0x669b23a8, 0x0be6f2f3}, - {0x5faf820e, 0x2e0df3c8, 0xf57ba925, 0x94012fad, 0xec7e04b6, 
0xd4a4c3f8, 0xdada7616, 0x09b10f9e}, - {0x5ccf87c6, 0xfe7b2472, 0xbca1f36d, 0x28a9c54c, 0xa2fcbf44, 0x69b51fda, 0xaf3bccd6, 0x1e85c3d0}, - {0xe06e6104, 0x6f7b3d2c, 0x0ca7fa8b, 0xa2dae3f7, 0x7f55cccb, 0xa8ed59c6, 0x9393d41a, 0x0136f0c1}, - {0xe8be0cf9, 0x46e4b3fc, 0x26a4ec96, 0x95cac63c, 0x72c6fabd, 0xb5383490, 0x7a77e6f4, 0x0bf03fb7}, - {0xbe7fae83, 0xf1533e2d, 0x2bf2f819, 0x07fa9bc3, 0x0ae79bd3, 0x639e807e, 0xd918b4d6, 0x048a18f9}, - {0xfd994358, 0x81f47ff5, 0xa4046266, 0x82d21187, 0x4f8b37af, 0xb853f627, 0x83c8d939, 0x1d28a336}, - {0x54fd384a, 0xa10aa9d9, 0x115fb459, 0x55c89a80, 0xf2fefc7c, 0x8124e414, 0x4dcb6e29, 0x240671d5}, - {0x0198b787, 0xdec6153e, 0xe4ced161, 0xca96510d, 0x7a5aa862, 0x5be2fd37, 0xf296b11c, 0x2da73caa}, - {0x05c55d1c, 0x4dce2389, 0xfa7c4637, 0xf9a0b409, 0x536fb2aa, 0x93cb1b47, 0xf192403b, 0x119bd737}, - {0xa6e170a7, 0x052227f3, 0x497e76fa, 0x7b6d8e56, 0x2167875a, 0xaba6b5f1, 0xdf18f989, 0x0aeda119}, - {0x5bebb03f, 0x22c5804b, 0x67f59436, 0xbe1e0138, 0x3485fed1, 0x67cf2e16, 0xc78bb32e, 0x2149424c}, - {0x122289c9, 0x8c4c6154, 0xe4a315a6, 0x6b6af77a, 0x9b660726, 0xb5f15d86, 0x3d681050, 0x035c63f6}, - {0x26251593, 0x1e5382ec, 0x4d18be62, 0x06b499fe, 0xc269da43, 0x42d636d0, 0x9bc0794a, 0x19bb352a}, - {0x8c321a28, 0xcd6f38f4, 0x2c9f1792, 0x95cceb99, 0x0d152ffa, 0x0630d09e, 0x8b277331, 0x151d457a}, - {0x88590882, 0xb8dde849, 0x0e1a5d5d, 0x67cf5acd, 0x723d5c5f, 0xe1ed0cc8, 0xc8953178, 0x188c51b4} - } }; - - - static constexpr storage_array omega_inv = { { {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, {0x8f703636, 0x23120470, 0xfd736bec, 0x5cea24f6, 0x3fd84104, 0x048b6e19, 0xe131a029, 0x30644e72}, {0xc1bd5e80, 0x948dad4a, 0xf8170a0a, 0x52627366, 0x96afef36, 0xec9b9e2f, 0xc8c14f22, 0x2b337de1}, {0xe306460b, 0xb11509c6, 0x174efb98, 0x996dfbe1, 0x94dd508c, 0x1c6e4f45, 0x16cbbf4e, 0x21082ca2}, {0x3bb512d0, 0x3eed4c53, 0x838eeb1d, 0x9c18d51b, 0x47c0b2a9, 0x9678200d, 0x306b93d2, 0x09c532c6}, {0x118f023a, 0xdb94fb05, 0x26e324be, 0x46a6cb24, 0x49bdadf2, 0xc24cdb76, 0x5b080fca, 0x1418144d}, - {0x3562e7f0, 0xa6d3ae87, 0xc2c72417, 0x0a6892e3, 0x9928147d, 0xd8f36419, 0x34009697, 0x197d1075}, - {0x33dfc1c9, 0x2a6289b1, 0x5501716e, 0xbce46410, 0x46fc5cda, 0xf889ee59, 0x17eaabaf, 0x02322181}, - {0xcc1f5f41, 0x228296e4, 0xb64ec189, 0x33dfcfa6, 0x761eb33b, 0x540e6644, 0x92785b8f, 0x280fea34}, - {0x7da16033, 0x5c3b9077, 0x521453d1, 0x872b404c, 0x9b054370, 0xfa5f6841, 0xada992f1, 0x0d0daaff}, - {0x717d26a6, 0xecb21619, 0xd2347943, 0xa7292758, 0xdb78b96e, 0xeb5c7bb1, 0xbe49270d, 0x26b0514e}, - {0xeff004d4, 0x61f1b8d2, 0x5394943e, 0xcaec7c92, 0x3adcbc53, 0xb3054a41, 0xd4d8eb05, 0x2fd82a77}, - {0x8518ecca, 0x1cccc58c, 0x1b0344ef, 0xd468e695, 0x4501b89c, 0x60888009, 0xa35f81da, 0x0b65bab2}, - {0xef3775e2, 0xaac68b20, 0x1a8af1f5, 0xade7937c, 0x48e14944, 0x4e613e72, 0xf67b5e99, 0x07ee2fe0}, - {0x01f73bfa, 0x7a9178dd, 0x4ad38023, 0x4e6a9df1, 0xc1cf1a77, 0x186f8ba1, 0x0113aedc, 0x1c75f370}, - {0x6f0924c9, 0x825269f8, 0x321acd8d, 0x85dc2b62, 0x0cff4400, 0xc63cb6e9, 0x0d87b733, 0x0bf840f4}, - {0x9cd2abc9, 0xb9064db6, 0x35033aba, 0x21800b41, 0x284fabbf, 0x2e7cd8b7, 0x50fd23e3, 0x14fdf780}, - {0x3c5b53bd, 0x31a0e6e5, 0xad2ade0a, 0x000e1067, 0xf7740140, 0x7507f5ca, 0x4d4c1f98, 0x1faf0653}, - {0x04b80dc8, 0x21ab655a, 0x7c0bd3dc, 0xf30ae094, 0x94ded480, 0x90f19302, 0x0ee779cb, 0x13a0614f}, - {0xc5d0d45f, 0x325dbdfc, 0xdb23c86e, 0x531a0e2c, 0x79c537a7, 0xa2a71200, 0x2b0445a8, 0x2e103cac}, - {0x0eb3de4a, 0x995227ff, 0xb0f25c6e, 0x735dd808, 0x36941528, 
0x990dabf7, 0xf1fe47c5, 0x19ffeb1c}, - {0x562cb6d5, 0xd61871ee, 0x8dc2c90d, 0xacd56e5a, 0x8d0d8980, 0xda46bba0, 0x92ec6935, 0x2d46308e}, - {0xa70a7c13, 0x1703a78e, 0xdd4ce698, 0xc6bc1d64, 0x5693e78e, 0xbd63b0af, 0x568a26b0, 0x1d527113}, - {0x02648ff7, 0x30b77d88, 0x5d7e4386, 0xf1a86cdd, 0x66dd8016, 0x69f57e82, 0x3aa86583, 0x11aeccf6}, - {0x0e4cebf9, 0x8c389a89, 0x1086a5f0, 0x04596644, 0x79d41b0e, 0xeb3dabcc, 0x4e649ca0, 0x2977e823}, - {0xe1ce2126, 0x3fb533e8, 0xba920fa8, 0xc4f9f250, 0xd91fa66c, 0x3b40e70b, 0x44d8f309, 0x295e48a4}, - {0xdfc40a8b, 0x52bb0a4c, 0x46112483, 0x4fb64a4b, 0x460eac6d, 0x70ffb433, 0xe671b22c, 0x193903e1}, - {0xe25ab83b, 0x44c8eb25, 0x9d2ac154, 0xc66b9e1b, 0xb17a4c68, 0xc023ff24, 0xb5e12a84, 0x18f27f93} + {0xba9d1811, 0x9d0e470c, 0xb6f24c79, 0x1dcb5564, 0xe85943e0, 0xdf5ce19c, 0xad310991, 0x16e73dfd}, + {0x74a57a76, 0xc8936191, 0x6750f230, 0x61794254, 0x9f36ffb0, 0xf086204a, 0xa6148404, 0x07b0c561}, + {0x470157ce, 0x893a7fa1, 0xfc782d75, 0xe8302a41, 0xdd9b0675, 0xffc02c0e, 0xf6e72f5b, 0x0f1ded1e}, + {0xbc2e5912, 0x11f995e1, 0xa8d2d7ab, 0x39ba79c0, 0xb08771e3, 0xebbebc2b, 0x7017a420, 0x06fd19c1}, + {0x769a2ee2, 0xd00a58f9, 0x7494f0ca, 0xb8c12c17, 0xa5355d71, 0xb4027fd7, 0x99c5042b, 0x027a3584}, + {0x0042d43a, 0x1c477572, 0x6f039bb9, 0x76f169c7, 0xfd5a90a9, 0x01ddd073, 0xde2fd10f, 0x0931d596}, + {0x9bbdd310, 0x4aa49b8d, 0x8e3a2d76, 0xd31bf3e2, 0x78b2667b, 0x001deac8, 0xb869ae62, 0x006fab49}, + {0x617c6e85, 0xadaa01c2, 0x7420aae6, 0xb4a93ee1, 0x0ddca8a8, 0x1f4e51b8, 0xcdd9e481, 0x2d965651}, + {0x4e26ecfb, 0xa93458fd, 0x4115a009, 0x022a2a2d, 0x69ec2bd0, 0x017171fa, 0x5941dc91, 0x2d1ba66f}, + {0xdaac43b7, 0xd1628ba2, 0xe4347e7d, 0x16c8601d, 0xe081dcff, 0x649abebd, 0x5981ed45, 0x00eeb2cb}, + {0xce8f58e5, 0x276e5858, 0x5655210e, 0x0512eca9, 0xe70e61f3, 0xc3708cc6, 0xa7d74902, 0x1bf82deb}, + {0x7dcdc0e0, 0x84c6bfa5, 0x13f4d1bd, 0xc57088ff, 0xb5b95e4d, 0x5c0176fb, 0x3a8d46c1, 0x19ddbcaf}, + {0x613f6cbd, 0x5c1d597f, 0x8357473a, 0x30525841, 0x968e4915, 0x51829353, 0x844bca52, 0x2260e724}, + {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x06d1e303, 0x27508aba, 0x0a0ed063, 0x26125da1}, + {0xfcd0b523, 0xb2c87885, 0xca5a5ce3, 0x58f50577, 0x8598fc8c, 0x4222150e, 0xae2bdd1a, 0x1ded8980}, + {0xa219447e, 0xa76dde56, 0x359eebbb, 0xec1a1f05, 0x8be08215, 0xcda0ceb6, 0xb1f8d9a7, 0x1ad92f46}, + {0xab80c59d, 0xb54d4506, 0x22dd991f, 0x5680c640, 0xbc23a139, 0x6b7bcf70, 0x5ab4c74d, 0x0210fe63}, + {0xe32b045b, 0x1c25f1e3, 0x2e832696, 0x145e0db8, 0x71c6441f, 0x852e2a03, 0x845d50d2, 0x0c9fabc7}, + {0xb878331a, 0xeccd4f3e, 0x8dc6d26e, 0x7b26b748, 0xd9130cd4, 0xa19b0361, 0x326341ef, 0x2a734ebb}, + {0x2f4e9212, 0x1c79bd57, 0x3d68f9ae, 0x605b52b6, 0xb8d89d4a, 0x0113eff9, 0xf1ff73b2, 0x1067569a}, + {0x80928c44, 0x034afc45, 0xf6437da2, 0xb4823532, 0x6dc6e364, 0x5f256a9f, 0xb363ebe8, 0x049ae702}, + {0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e, 0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0} + } }; + + + static constexpr storage_array omega_inv = { { + {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, + {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000}, + {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711}, + {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf}, + {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136}, + {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 
0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2}, + {0xc64427d7, 0xdddf985f, 0xa49e95bd, 0xaa4f964a, 0x5def8b04, 0x427c045f, 0x7969b732, 0x1641c053}, + {0x0329f5d6, 0x692c553d, 0x8712848a, 0xa54cf8c6, 0x38e2b5e6, 0x64751ad9, 0x7422fad3, 0x204bd327}, + {0xaf6b3e4e, 0x52f26c0f, 0xf0bcc0c8, 0x4c277a07, 0xe4fcfcab, 0x546875d5, 0xaa9995b3, 0x09d8f821}, + {0xb2e5cc71, 0xcaa2e1e9, 0x6e43404e, 0xed42b68e, 0x7a2c7f0a, 0x6ed80915, 0xde3c86d6, 0x1c4042c7}, + {0x579d71ae, 0x20a3a65d, 0x0adc4420, 0xfd7efed8, 0xfddabf54, 0x3bb6dcd7, 0xbc73d07b, 0x0fa9bb21}, + {0xc79e0e57, 0xb6f70f8d, 0xa04e05ac, 0x269d3fde, 0x2ba088d9, 0xcf2e371c, 0x11b88d9c, 0x1af864d2}, + {0xabd95dc9, 0x3b0b205a, 0x978188ca, 0xc8df74fa, 0x6a1cb6c8, 0x08e124db, 0xbfac6104, 0x1670ed58}, + {0x641c8410, 0xf8eee934, 0x677771c0, 0xf40976b0, 0x558e6e8c, 0x11680d42, 0x06e7e9e9, 0x281c036f}, + {0xb2dbc0b4, 0xc92a742f, 0x4d384e68, 0xc3f02842, 0x2fa43d0d, 0x22701b6f, 0xe4590b37, 0x05d33766}, + {0x02d842d4, 0x922d5ac8, 0xc830e4c6, 0x91126414, 0x082f37e0, 0xe92338c0, 0x7fe704e8, 0x0b5d56b7}, + {0xd96f0d22, 0x20e75251, 0x6bd4e8c9, 0xc01c7f08, 0xf9dd50c4, 0x37d8b00b, 0xc43ca872, 0x244cf010}, + {0x66c5174c, 0x7a823174, 0x22d5ad70, 0x7dbe118c, 0x111119c5, 0xf8d7c71d, 0x83780e87, 0x036853f0}, + {0xca535321, 0xd98f9924, 0xe66e6c81, 0x22dbc0ef, 0x664ae1b7, 0xa15cf806, 0xa314fb67, 0x06e402c0}, + {0xe26c91f3, 0x0852a8fd, 0x3baca626, 0x521f45cb, 0x2c51bfca, 0xab6473bc, 0x2100895f, 0x100c332d}, + {0xa376d0f0, 0xf5fac783, 0x940797d3, 0x50fd246e, 0x145f5278, 0xab14ecc1, 0x41091b14, 0x19c6dfb8}, + {0x7faa1396, 0x43dc52e2, 0x4beced23, 0xd437be9d, 0x6d3c38c3, 0xecc11e9c, 0x0c74a876, 0x2eb58439}, + {0xd69ca83b, 0x811b03e7, 0xa1a6eadf, 0x126a786b, 0x4e2b8e61, 0x1dd75c9f, 0xbda6792b, 0x2165a1a5}, + {0x110b737b, 0x02e1d4d1, 0xb323a164, 0x7be1488d, 0x9cd06163, 0xa334d317, 0xdb50e9cd, 0x2710c370}, + {0x9550fe47, 0x45d2f3cb, 0xf6a8efc4, 0x5f43327b, 0xe993ee18, 0x5bcd0d50, 0xb21de952, 0x27f035bd}, + {0x232e3983, 0x1d63cbae, 0xaa1b58e2, 0xac815161, 0x6aeb019e, 0x531f42a5, 0x03ca2ef5, 0x2dcd51d9}, + {0x980db869, 0xa8b64ba8, 0xc9718f6c, 0x4c787f72, 0x15d27ced, 0x7746a25a, 0x435a46e9, 0x110bf78f}, + {0x9d18157e, 0x72394277, 0xfd399d5d, 0xec9d51f8, 0x49d5387f, 0x6117635d, 0x9c229cd5, 0x01b77519} } }; @@ -128,6 +130,9 @@ namespace PARAMS_BN254 { static constexpr storage m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17, 0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462}; static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28, 0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x14afa37, 0xed84884a, 0x278edf8, 0xeb202285, 0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571}; + // i^2, the square of the imaginary unit for the extension field static constexpr uint32_t i_squared = 1; // true if i^2 is negative diff --git a/icicle/curves/bn254/projective.cu b/icicle/curves/bn254/projective.cu index c95f48938..e4f56e5e6 100644 --- a/icicle/curves/bn254/projective.cu +++ b/icicle/curves/bn254/projective.cu @@ -16,4 +16,4 @@ extern "C" bool eq_g2_bn254(BN254::g2_projective_t *point1, BN254::g2_projective !((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) && (point1->z == BN254::g2_point_field_t::zero())) && 
!((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) && (point2->z == BN254::g2_point_field_t::zero())); } -#endif \ No newline at end of file +#endif diff --git a/icicle/curves/bn254/ve_mod_mult.cu b/icicle/curves/bn254/ve_mod_mult.cu index 6acef1fab..c467bda94 100644 --- a/icicle/curves/bn254/ve_mod_mult.cu +++ b/icicle/curves/bn254/ve_mod_mult.cu @@ -51,6 +51,21 @@ extern "C" int32_t vec_mod_mult_scalar_bn254(BN254::scalar_t *inout, } } +extern "C" int32_t vec_mod_mult_device_scalar_bn254( + BN254::scalar_t *inout, + BN254::scalar_t *scalar_vec, + size_t n_elements, + size_t device_id +) { + try { + vector_mod_mult_device(scalar_vec, inout, inout, n_elements); + return CUDA_SUCCESS; + } catch (const std::runtime_error &ex) { + printf("error %s", ex.what()); // TODO: error code and message + return -1; + } +} + extern "C" int32_t matrix_vec_mod_mult_bn254(BN254::scalar_t *matrix_flattened, BN254::scalar_t *input, BN254::scalar_t *output, diff --git a/icicle/curves/bn254/ve_mod_mult.h b/icicle/curves/bn254/ve_mod_mult.h new file mode 100644 index 000000000..6b974118e --- /dev/null +++ b/icicle/curves/bn254/ve_mod_mult.h @@ -0,0 +1,41 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
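// A minimal usage sketch for the C bindings declared in this header (illustrative only,
// not part of the generated file): the wrapper name, the device-resident buffers and the
// element count are assumptions; the exported signature and the 0 / -1 return convention
// come from ve_mod_mult.cu above.
//
//   int scale_scalars_in_place(BN254_scalar_t *d_inout, BN254_scalar_t *d_scalars, size_t n) {
//     // the *_device_* variant appears to expect both pointers to already live on the GPU
//     return vec_mod_mult_device_scalar_bn254(d_inout, d_scalars, n, /*device_id=*/0);
//   }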
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BN254_VEC_MULT_H +#define _BN254_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +int32_t vec_mod_mult_point_bn254(BN254_projective_t *inout, BN254_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_bn254(BN254_scalar_t *inout, BN254_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_device_scalar_bn254(BN254_scalar_t *inout, BN254_scalar_t *scalar_vec, size_t n_elements, size_t device_id); +int32_t matrix_vec_mod_mult_bn254(BN254_scalar_t *matrix_flattened, BN254_scalar_t *input, BN254_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_VEC_MULT_H */ diff --git a/icicle/curves/curve_template/lde.cu b/icicle/curves/curve_template/lde.cu index 82240aeca..ef9d892c6 100644 --- a/icicle/curves/curve_template/lde.cu +++ b/icicle/curves/curve_template/lde.cu @@ -24,12 +24,12 @@ extern "C" ${CURVE_NAME_U}::scalar_t* build_domain_cuda_${CURVE_NAME_L}(uint32_t } } -extern "C" int ntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { cudaStreamCreate(&stream); - return ntt_end2end_template<${CURVE_NAME_U}::scalar_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id + return ntt_end2end_template<${CURVE_NAME_U}::scalar_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, decimation, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -39,12 +39,12 @@ extern "C" int ntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t } } -extern "C" int ecntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { cudaStreamCreate(&stream); - return ntt_end2end_template<${CURVE_NAME_U}::projective_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id + return ntt_end2end_template<${CURVE_NAME_U}::projective_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, decimation, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -85,7 +85,8 @@ extern "C" int interpolate_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_ { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + ${CURVE_NAME_U}::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,8 +100,37 @@ extern "C" int interpolate_scalars_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::s { try { + ${CURVE_NAME_U}::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int 
interpolate_scalars_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t *d_evaluations, ${CURVE_NAME_U}::scalar_t *d_domain, unsigned n, ${CURVE_NAME_U}::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) +{ + try + { + return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int interpolate_scalars_batch_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_evaluations, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned n, + unsigned batch_size, ${CURVE_NAME_U}::scalar_t* coset_powers, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -113,7 +143,8 @@ extern "C" int interpolate_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projecti { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + ${CURVE_NAME_U}::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -127,8 +158,9 @@ extern "C" int interpolate_points_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::pr { try { + ${CURVE_NAME_U}::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -268,6 +300,7 @@ extern "C" int reverse_order_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scala uint32_t logn = uint32_t(log(n) / log(2)); cudaStreamCreate(&stream); reverse_order(arr, n, logn, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/curve_template/msm.cu b/icicle/curves/curve_template/msm.cu index bbfe0a368..9a8ce6f95 100644 --- a/icicle/curves/curve_template/msm.cu +++ b/icicle/curves/curve_template/msm.cu @@ -11,7 +11,7 @@ int msm_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *out, ${CURVE_NAME_U} { try { - large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, stream); + large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* out { try { - large_msm(d_scalars, d_points, count, d_out, true, stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); cudaStreamSynchronize(stream); return 0; } diff --git a/icicle/curves/curve_template/projective.cu b/icicle/curves/curve_template/projective.cu index 32ba4e247..23190f046 100644 --- a/icicle/curves/curve_template/projective.cu +++ b/icicle/curves/curve_template/projective.cu @@ -16,4 +16,4 @@ extern "C" bool eq_g2_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *point1, !((point1->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->z == ${CURVE_NAME_U}::g2_point_field_t::zero())) && !((point2->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->y == 
${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->z == ${CURVE_NAME_U}::g2_point_field_t::zero())); } -#endif \ No newline at end of file +#endif diff --git a/icicle/primitives/extension_field.cuh b/icicle/primitives/extension_field.cuh index acdbc3543..28682b2b9 100644 --- a/icicle/primitives/extension_field.cuh +++ b/icicle/primitives/extension_field.cuh @@ -14,24 +14,15 @@ template class ExtensionField { FWide real; FWide imaginary; - ExtensionField HOST_DEVICE_INLINE get_lower() { - return ExtensionField { real.get_lower(), imaginary.get_lower() }; + friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys) { + return ExtensionWide { xs.real + ys.real, xs.imaginary + ys.imaginary }; } - - ExtensionField HOST_DEVICE_INLINE get_higher_with_slack() { - return ExtensionField { real.get_higher_with_slack(), imaginary.get_higher_with_slack() }; + + friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys) { + return ExtensionWide { xs.real - ys.real, xs.imaginary - ys.imaginary }; } }; - friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys) { - return ExtensionField { xs.real + ys.real, xs.imaginary + ys.imaginary }; - } - - // an incomplete impl that assumes that xs > ys - friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys) { - return ExtensionField { xs.real - ys.real, xs.imaginary - ys.imaginary }; - } - public: typedef Field FF; static constexpr unsigned TLC = 2 * CONFIG::limbs_count; @@ -55,13 +46,12 @@ template class ExtensionField { return ExtensionField { FF { CONFIG::g2_gen_y_re }, FF { CONFIG::g2_gen_y_im } }; } - static HOST_INLINE ExtensionField rand_host() { return ExtensionField { FF::rand_host(), FF::rand_host() }; } - template static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionField &xs) { - return ExtensionField { FF::reduce(&xs.real), FF::reduce(&xs.imaginary) }; + template static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField &xs) { + return ExtensionField { FF::sub_modulus(&xs.real), FF::sub_modulus(&xs.imaginary) }; } friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs) { @@ -79,21 +69,22 @@ template class ExtensionField { template static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys) { - FWide real_prod = FF::mul_wide(xs.real * ys.real); - FWide imaginary_prod = FF::mul_wide(xs.imaginary * ys.imaginary); + FWide real_prod = FF::mul_wide(xs.real, ys.real); + FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary); FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary); - FWide i_sq_times_im = FF::mul_unsigned(imaginary_prod); - i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im; - return ExtensionField { real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod }; + FWide i_sq_times_im = FF::template mul_unsigned(imaginary_prod); + i_sq_times_im = CONFIG::i_squared_is_negative ? 
FWide::neg(i_sq_times_im) : i_sq_times_im; + return ExtensionWide { real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod }; + } + + template + static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs) { + return ExtensionField { FF::template reduce(xs.real), FF::template reduce(xs.imaginary) }; } friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys) { - FF real_prod = xs.real * ys.real; - FF imaginary_prod = xs.imaginary * ys.imaginary; - FF prod_of_sums = (xs.real + xs.imaginary) * (ys.real + ys.imaginary); - FF i_sq_times_im = FF::template mul_unsigned(imaginary_prod); - i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im; - return ExtensionField { real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod }; + ExtensionWide xy = mul_wide(xs, ys); + return reduce(xy); } friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys) { @@ -104,14 +95,16 @@ template class ExtensionField { return !(xs == ys); } - template - static constexpr HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField &xs) { - constexpr uint32_t mul_real = mutliplier.real.limbs_storage.limbs[0]; - constexpr uint32_t mul_imaginary = mutliplier.imaginary.limbs_storage.limbs[0]; - FF real_prod = FF::template mul_unsigned(xs.real); - FF imaginary_prod = FF::template mul_unsigned(xs.imaginary); - FF re_im = FF::template mul_unsigned(xs.imaginary); - FF im_re = FF::template mul_unsigned(xs.real); + template + static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField &xs) { + static constexpr FF mul_real = multiplier.real; + static constexpr FF mul_imaginary = multiplier.imaginary; + const FF xs_real = xs.real; + const FF xs_imaginary = xs.imaginary; + FF real_prod = FF::template mul_const(xs_real); + FF imaginary_prod = FF::template mul_const(xs_imaginary); + FF re_im = FF::template mul_const(xs_imaginary); + FF im_re = FF::template mul_const(xs_real); FF i_sq_times_im = FF::template mul_unsigned(imaginary_prod); i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im; return ExtensionField { real_prod + i_sq_times_im, re_im + im_re }; @@ -142,8 +135,10 @@ template class ExtensionField { // inverse assumes that xs is nonzero static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs) { ExtensionField xs_conjugate = { xs.real, FF::neg(xs.imaginary) }; + FF i_sq_times_im = FF::template mul_unsigned(FF::sqr(xs.imaginary)); + i_sq_times_im = CONFIG::i_squared_is_negative ? 
FF::neg(i_sq_times_im) : i_sq_times_im; // TODO: wide here - FF xs_norm_squared = FF::sqr(xs.real) + FF::sqr(xs.imaginary); + FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im; return xs_conjugate * ExtensionField { FF::inverse(xs_norm_squared), FF::zero() }; } }; diff --git a/icicle/primitives/field.cuh b/icicle/primitives/field.cuh index 4bff9cebd..11186e896 100644 --- a/icicle/primitives/field.cuh +++ b/icicle/primitives/field.cuh @@ -44,48 +44,55 @@ template class Field { } static HOST_INLINE Field omega(uint32_t logn) { - if (logn == 0) { - return Field { CONFIG::one }; - } + if (logn == 0) { + return Field { CONFIG::one }; + } - if (logn > CONFIG::omegas_count) { - throw std::invalid_argument( "Field: Invalid omega index" ); - } + if (logn > CONFIG::omegas_count) { + throw std::invalid_argument( "Field: Invalid omega index" ); + } - storage_array const omega = CONFIG::omega; - return Field { omega.storages[logn-1] }; + storage_array const omega = CONFIG::omega; + return Field { omega.storages[logn-1] }; } static HOST_INLINE Field omega_inv(uint32_t logn) { - if (logn == 0) { - return Field { CONFIG::one }; - } + if (logn == 0) { + return Field { CONFIG::one }; + } - if (logn > CONFIG::omegas_count) { - throw std::invalid_argument( "Field: Invalid omega_inv index" ); - } + if (logn > CONFIG::omegas_count) { + throw std::invalid_argument( "Field: Invalid omega_inv index" ); + } - storage_array const omega_inv = CONFIG::omega_inv; - return Field { omega_inv.storages[logn-1] }; + storage_array const omega_inv = CONFIG::omega_inv; + return Field { omega_inv.storages[logn-1] }; } - + static HOST_INLINE Field inv_log_size(uint32_t logn) { - if (logn == 0) { - return Field { CONFIG::one }; - } - - if (logn > CONFIG::omegas_count) { - throw std::invalid_argument( "Field: Invalid inv index" ); - } + if (logn == 0) { + return Field { CONFIG::one }; + } - storage_array const inv = CONFIG::inv; - return Field { inv.storages[logn-1] }; + if (logn > CONFIG::omegas_count) { + throw std::invalid_argument( "Field: Invalid inv index" ); + } + storage_array const inv = CONFIG::inv; + return Field { inv.storages[logn-1] }; } static constexpr HOST_DEVICE_INLINE Field modulus() { return Field { CONFIG::modulus }; } + static constexpr HOST_DEVICE_INLINE Field montgomery_r() { + return Field { CONFIG::montgomery_r }; + } + + static constexpr HOST_DEVICE_INLINE Field montgomery_r_inv() { + return Field { CONFIG::montgomery_r_inv }; + } + // private: typedef storage ff_storage; typedef storage<2*TLC> ff_wide_storage; @@ -95,44 +102,63 @@ template class Field { struct Wide { ff_wide_storage limbs_storage; - Field HOST_DEVICE_INLINE get_lower() { + static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide &xs) { Field out{}; #ifdef __CUDA_ARCH__ #pragma unroll #endif for (unsigned i = 0; i < TLC; i++) - out.limbs_storage.limbs[i] = limbs_storage.limbs[i]; + out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i]; return out; } - Field HOST_DEVICE_INLINE get_higher_with_slack() { + static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide &xs) { Field out{}; #ifdef __CUDA_ARCH__ #pragma unroll #endif for (unsigned i = 0; i < TLC; i++) { #ifdef __CUDA_ARCH__ - out.limbs_storage.limbs[i] = __funnelshift_lc(limbs_storage.limbs[i + TLC - 1], limbs_storage.limbs[i + TLC], slack_bits); + out.limbs_storage.limbs[i] = __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], slack_bits); #else - out.limbs_storage.limbs[i] = (limbs_storage.limbs[i + TLC] << slack_bits) 
+ (limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits)); + out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << slack_bits) + (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits)); #endif } return out; } - }; - friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys) { - Wide rs = {}; - add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); - return rs; - } + template static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide &xs) { + if (REDUCTION_SIZE == 0) + return xs; + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + return sub_limbs(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs; + } - // an incomplete impl that assumes that xs > ys - friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) { - Wide rs = {}; - sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); - return rs; - } + template + static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs) { + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + sub_limbs(modulus, xs.limbs_storage, rs.limbs_storage); + return rs; + } + + friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys) { + Wide rs = {}; + add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + return sub_modulus_squared<1>(rs); + } + + friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) { + Wide rs = {}; + uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + if (carry == 0) + return rs; + const ff_wide_storage modulus = get_modulus_squared<1>(); + add_limbs(rs.limbs_storage, modulus, rs.limbs_storage); + return rs; + } + }; // return modulus template static constexpr HOST_DEVICE_INLINE ff_storage get_modulus() { @@ -232,6 +258,14 @@ template class Field { return CARRY_OUT ? 
carry : 0; } + static constexpr HOST_INLINE uint32_t sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) { + uint32_t carry = 0; + host_math::carry_chain<2 * TLC, false, true> chain; + for (unsigned i = 0; i < num_limbs; i++) + r[i] = chain.sub(x[i], y[i], carry); + return carry; + } + template static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T &xs, const T &ys, T &rs) { #ifdef __CUDA_ARCH__ return add_sub_limbs_device(xs, ys, rs); @@ -256,7 +290,17 @@ template class Field { } } + static DEVICE_INLINE void mul_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t start_i = 0) { + #pragma unroll + for (size_t i = start_i; i < n; i += 2) { + acc[i] = ptx::mul_lo(a[i], bi); + acc[i + 1] = ptx::mul_hi(a[i], bi); + } + } + static DEVICE_INLINE void cmad_n(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC) { + // multiply scalar by vector + // acc = acc + bi*A[::2] acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]); acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]); #pragma unroll @@ -266,7 +310,21 @@ template class Field { } } + static DEVICE_INLINE void cmad_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx=0) { + // multiply scalar by vector + // acc = acc + bi*A[::2] + acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]); + acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]); + #pragma unroll + for (size_t i = a_start_idx + 2; i < n; i += 2) { + acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); + acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]); + } + } + static DEVICE_INLINE void mad_row(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC) { + // odd = odd + bi*A + // even = even + bi*A cmad_n(odd, a + 1, bi, n - 2); odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); @@ -274,6 +332,16 @@ template class Field { odd[n - 1] = ptx::addc(odd[n - 1], 0); } + static DEVICE_INLINE void mad_row_msb(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) { + // odd = odd + bi*A + // even = even + bi*A + cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1); + odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); + odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); + cmad_n_msb(even, a, bi, n, a_start_idx); + odd[n - 1] = ptx::addc(odd[n - 1], 0); + } + static DEVICE_INLINE void multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { const uint32_t *a = as.limbs; const uint32_t *b = bs.limbs; @@ -295,13 +363,280 @@ template class Field { even[i + 1] = ptx::addc(even[i + 1], 0); } + static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t *r) { + r[0] = ptx::mul_lo(a, b); + r[1] = ptx::mul_hi(a, b); + } + + static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *r = rs.limbs; + uint32_t i, j; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC]; + for (uint32_t i = 0; i < 2 * TLC; i++) + { + even[i] = 0; + odd[i] = 0; + } + // first row special case, no carry in no carry out. split to non parts, even and odd. 
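// Layout note for the accumulation below: the partial product a[j]*b[i] occupies result
// limbs (i+j) and (i+j+1). Products starting at an even result limb are accumulated into
// even[] at that index; products starting at an odd result limb go into odd[], shifted
// down by one (odd[k] collects contributions to result limb k+1). Keeping the two carry
// chains in separate arrays lets each row run as uninterrupted mad_lo_cc/madc_hi_cc chains,
// and the arrays are merged with a single carry propagation at the end of the function.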
+ for (i = 0; i < TLC - 1; i+=2 ) + { + mult_no_carry(b[0], a[i], &even[i]); + mult_no_carry(b[0], a[i + 1], &odd[i]); + } + + // doing two rows at one loop + for (i = 1; i < TLC - 1; i+=2) + { + // odd bi's + // multiply accumulate even part of new row with odd part prev row (needs a carry) + // // j = 0, no carry in, only carry out + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + for (j = 2; j < TLC; j+=2) // 2, 4, 6 + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + for (j = 3; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // even bi's + // multiply accumulate even part of new row with even part of prev row // needs a carry + // j = 0, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]); + // for loop, carry in, carry out. + for (j = 2; j < TLC; j+=2) + { + even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]); + even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]); + } + even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry + + // multiply accumulate odd part of new row with odd part of prev row + // j = 1, no carry in, only carry out + odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]); + odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]); + // for loop, carry in, carry out. 
+ for (j = 3; j < TLC; j+=2) + { + odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); + odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); + } + + } + + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + for (j = 2; j < TLC; j+=2) + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + for (j = 3; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // add even and odd parts + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void ingo_msb_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *r = rs.limbs; + uint32_t i, j; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC]; + for (uint32_t i = 0; i < 2 * TLC; i++) + { + even[i] = 0; + odd[i] = 0; + } + // only last element from first row. + mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]); + + // doing two rows at one loop + #pragma unroll + for (i = 1; i < TLC - 1; i+=2) + { + const uint32_t first_active_j = TLC - 1 - i; + const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2)); + const uint32_t first_active_j_even = first_active_j + first_active_j % 2 ; + // odd bi's + // multiply accumulate even part of new row with odd part prev row (needs a carry) + // j = 0, no carry in, only carry out + odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]); + odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]); + // for loop carry in carry out + #pragma unroll + for (j = first_active_j_even + 2; j < TLC; j+=2) + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) + // j = 1, no carry in, only carry out + even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]); + even[i + first_active_j_odd + 1] = ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]); + // for loop carry in carry out + #pragma unroll + for (j = first_active_j_odd + 2; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // even bi's + uint32_t const first_active_j1 = TLC - 1 - (i + 1) ; + uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2)); + uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2; + // multiply accumulate even part of new row with even part of prev row // needs a carry + // j = 0, no carry in, only carry out + 
even[first_active_j_even1 + i + 1] = ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]); + even[first_active_j_even1 + i + 2] = ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]); + // for loop, carry in, carry out. + #pragma unroll + for (j = first_active_j_even1 + 2; j < TLC; j+=2) + { + even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]); + even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]); + } + even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry + + // multiply accumulate odd part of new row with odd part of prev row + // j = 1, no carry in, only carry out + odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]); + odd[first_active_j_odd1+ i + 1] = ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]); + // for loop, carry in, carry out. + #pragma unroll + for (j = first_active_j_odd1 + 2; j < TLC; j+=2) + { + odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); + odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); + } + + } + + // last round, i = TLC - 1 + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + #pragma unroll + for (j = 2; j < TLC; j+=2) + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + #pragma unroll + for (j = 3; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // add even and odd parts + even[1] = ptx::add_cc(even[1], odd[0]); + #pragma unroll + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + // r = a * b is correcrt for the first TLC + 1 digits. (not computing from TLC + 1 to 2*TLC - 2). 
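// Where this is consumed: the Barrett-style reduce() further down uses this routine for the
// l_hi * modulus term; the remainder it derives is then only read through sub_limbs_partial
// over TLC + 1 limbs and Wide::get_lower over TLC limbs, which is why only the first
// TLC + 1 digits need to be computed exactly here.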
+ const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + mul_n(even, a, b[0]); + mul_n(odd, a + 1, b[0]); + mad_row(&even[2], &odd[0], a, b[1]); + size_t i; + #pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2); + mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2); + } + + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < TLC + 1; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + for (int i=0; i<2*TLC - 1; i++) + { + even[i] = 0; + odd[i] = 0; + } + uint32_t min_indexes_sum = TLC - 1; + // only diagonal + mul_n_msb(even, a, b[0], TLC, min_indexes_sum); + mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1); + mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1); + size_t i; + #pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i]); + mad_row(&even[i + 2], &odd[i], a, b[i + 1]); + } + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + static HOST_INLINE void multiply_raw_host(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { const uint32_t *a = as.limbs; const uint32_t *b = bs.limbs; uint32_t *r = rs.limbs; for (unsigned i = 0; i < TLC; i++) { uint32_t carry = 0; - for (unsigned j = 0; j < TLC; j++) + for (unsigned j = 0; j < TLC; j++) r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry); r[TLC + i] = carry; } @@ -315,6 +650,22 @@ template class Field { #endif } + static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + #ifdef __CUDA_ARCH__ + return multiply_lsb_raw_device(as, bs, rs); + #else + return multiply_raw_host(as, bs, rs); + #endif + } + + static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + #ifdef __CUDA_ARCH__ + return multiply_raw_device(as, bs, rs); + #else + return multiply_raw_host(as, bs, rs); + #endif + } + public: ff_storage limbs_storage; @@ -345,7 +696,7 @@ template class Field { return value; } - template static constexpr HOST_DEVICE_INLINE Field reduce(const Field &xs) { + template static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field &xs) { if (REDUCTION_SIZE == 0) return xs; const ff_storage modulus = get_modulus(); @@ -368,7 +719,7 @@ template class Field { friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys) { Field rs = {}; add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); - return reduce<1>(rs); + return sub_modulus<1>(rs); } friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys) { @@ -388,20 +739,49 @@ template class Field { return rs; } - friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys) { - Wide xy = mul_wide(xs, ys); - Field xy_hi = xy.get_higher_with_slack(); + static constexpr DEVICE_INLINE uint32_t sub_limbs_partial_device(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) { + r[0] = ptx::sub_cc(x[0], y[0]); + #pragma unroll + for (unsigned i = 1; i 
< num_limbs; i++) + r[i] = ptx::subc_cc(x[i], y[i]); + return ptx::subc(0, 0); + } + + static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs_partial(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) { + #ifdef __CUDA_ARCH__ + return sub_limbs_partial_device(x, y, r, num_limbs); + #else + return sub_limbs_partial_host(x, y, r, num_limbs); + #endif + } + + template + static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs) { + Field xs_hi = Wide::get_higher_with_slack(xs); // xy << slack_bits Wide l = {}; - multiply_raw(xy_hi.limbs_storage, get_m(), l.limbs_storage); - Field l_hi = l.get_higher_with_slack(); + multiply_raw_msb(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult + Field l_hi = Wide::get_higher_with_slack(l); Wide lp = {}; - multiply_raw(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); - Wide r_wide = xy - lp; + multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult + Wide r_wide = xs - lp; Wide r_wide_reduced = {}; - uint32_t reduced = sub_limbs(r_wide.limbs_storage, modulus_wide(), r_wide_reduced.limbs_storage); - r_wide = reduced ? r_wide : r_wide_reduced; - Field r = r_wide.get_lower(); - return reduce<1>(r); + for (unsigned i = 0; i < TLC + 1; i++) + { + uint32_t carry = sub_limbs_partial(r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1); + if (carry == 0) // continue to reduce + r_wide = r_wide_reduced; + else // done + break; + } + + // number of wrap around is bounded by TLC + 1 times. + Field r = Wide::get_lower(r_wide); + return r; + } + + friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys) { + Wide xy = mul_wide(xs, ys); // full mult + return reduce(xy); } friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys) { @@ -425,8 +805,19 @@ template class Field { return !(xs == ys); } - template static constexpr HOST_DEVICE_INLINE T mul_const(const T &xs) { - return mul_unsigned(xs); + template + static HOST_DEVICE_INLINE Field mul_const(const Field& xs) { + Field mul = multiplier; + static bool is_u32 = true; + #ifdef __CUDA_ARCH__ + #pragma unroll + #endif + for (unsigned i = 1; i < TLC; i++) + is_u32 &= (mul.limbs_storage.limbs[i] == 0); + + if (is_u32) + return mul_unsigned(xs); + return mul * xs; } template @@ -485,7 +876,7 @@ template class Field { #endif } r[TLC - 1] = x[TLC - 1] >> 1; - return reduce(rs); + return sub_modulus(rs); } static constexpr HOST_DEVICE_INLINE bool lt(const Field &xs, const Field &ys) { diff --git a/icicle/primitives/projective.cuh b/icicle/primitives/projective.cuh index d73d711f7..5ba274818 100644 --- a/icicle/primitives/projective.cuh +++ b/icicle/primitives/projective.cuh @@ -33,47 +33,47 @@ class Projective { } friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2) { - const FF X1 = p1.x; // < 2 - const FF Y1 = p1.y; // < 2 - const FF Z1 = p1.z; // < 2 - const FF X2 = p2.x; // < 2 - const FF Y2 = p2.y; // < 2 - const FF Z2 = p2.z; // < 2 - const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 - const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 - const FF t02 = Z1 * Z2; // t02 ← Z1 · Z2 < 2 - const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 - const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 - const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 - const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 - const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 - const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 - const FF t09 = Y2 + Z2; // t09 ← Y2 + Z2 < 4 - const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 - 
const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 - const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 - const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 - const FF t14 = X2 + Z2; // t14 ← X2 + Z2 < 4 - const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 - const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 - const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 - const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 - const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF Z2 = p2.z; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1 * Z2; // t02 ← Z1 · Z2 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + Z2; // t09 ← Y2 + Z2 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + Z2; // t14 ← X2 + Z2 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 const FF t20 = FF::template mul_unsigned<3>( - FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 - const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 - const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 const FF t23 = FF::template mul_unsigned<3>( - FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 - const FF t24 = t12 * t23; // t24 ← t12 · t23 < 2 - const FF t25 = t07 * t22; // t25 ← t07 · t22 < 2 - const FF X3 = t25 - t24; // X3 ← t25 − t24 < 2 - const FF t27 = t23 * t19; // t27 ← t23 · t19 < 2 - const FF t28 = t22 * t21; // t28 ← t22 · t21 < 2 - const FF Y3 = t28 + t27; // Y3 ← t28 + t27 < 2 - const FF t30 = t19 * t07; // t30 ← t19 · t07 < 2 - const FF t31 = t21 * t12; // t31 ← t21 · t12 < 2 - const FF Z3 = t31 + t30; // Z3 ← t31 + t30 < 2 + FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 return {X3, Y3, Z3}; } @@ -82,13 +82,47 @@ class Projective { } friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine& p2) { - // TODO: change the implementation to a more efficient mixed adder later on - return p1 + from_affine(p2); - } - - friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point) { - os << "Point { x: " << point.x << "; y: " << 
point.y << "; z: " << point.z << " }"; - return os; + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1; // t02 ← Z1 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + FF::one(); // t09 ← Y2 + 1 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + FF::one(); // t14 ← X2 + 1 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF t20 = FF::template mul_unsigned<3>( + FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + const FF t23 = FF::template mul_unsigned<3>( + FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 + return {X3, Y3, Z3}; } friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine& p2) { @@ -115,6 +149,11 @@ class Projective { return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z); } + friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point) { + os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }"; + return os; + } + static HOST_DEVICE_INLINE bool is_zero(const Projective &point) { return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero(); } diff --git a/icicle/primitives/test.cu b/icicle/primitives/test.cu index 8870cbe3d..adc6572d5 100644 --- a/icicle/primitives/test.cu +++ b/icicle/primitives/test.cu @@ -1,8 +1,9 @@ #include #include - #include "test_kernels.cuh" - +#include +#include +namespace mp = boost::multiprecision; template int device_populate_random(T* d_elements, unsigned n) { @@ -20,49 +21,63 @@ int device_set(T* d_elements, T el, unsigned n) { return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice); } +mp::int1024_t convert_to_boost_mp(uint32_t *a, uint32_t length) +{ + mp::int1024_t res = 0; + for (uint32_t i = 0; i < length; i++) + { + res += (mp::int1024_t)(a[i]) << 32 * i; + } + return res; +} + class PrimitivesTest : public ::testing::Test { protected: - static const unsigned n = 1 << 5; - - proj *points1{}; - proj *points2{}; - g2_proj *g2_points1{}; - g2_proj *g2_points2{}; - scalar_field *scalars1{}; - 
scalar_field *scalars2{}; - proj *zero_points{}; - g2_proj *g2_zero_points{}; - scalar_field *zero_scalars{}; - scalar_field *one_scalars{}; - affine *aff_points{}; - g2_affine *g2_aff_points{}; - proj *res_points1{}; - proj *res_points2{}; - g2_proj *g2_res_points1{}; - g2_proj *g2_res_points2{}; - scalar_field *res_scalars1{}; - scalar_field *res_scalars2{}; + static const unsigned n = 1 << 4; + + projective_t *points1{}; + projective_t *points2{}; + g2_projective_t *g2_points1{}; + g2_projective_t *g2_points2{}; + scalar_field_t *scalars1{}; + scalar_field_t *scalars2{}; + projective_t *zero_points{}; + g2_projective_t *g2_zero_points{}; + scalar_field_t *zero_scalars{}; + scalar_field_t *one_scalars{}; + affine_t *aff_points{}; + g2_affine_t *g2_aff_points{}; + projective_t *res_points1{}; + projective_t *res_points2{}; + g2_projective_t *g2_res_points1{}; + g2_projective_t *g2_res_points2{}; + scalar_field_t *res_scalars1{}; + scalar_field_t *res_scalars2{}; + scalar_field_t::Wide *res_scalars_wide{}; + scalar_field_t::Wide *res_scalars_wide_full{}; PrimitivesTest() { assert(!cudaDeviceReset()); - assert(!cudaMallocManaged(&points1, n * sizeof(proj))); - assert(!cudaMallocManaged(&points2, n * sizeof(proj))); - assert(!cudaMallocManaged(&g2_points1, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&g2_points2, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&zero_points, n * sizeof(proj))); - assert(!cudaMallocManaged(&g2_zero_points, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&aff_points, n * sizeof(affine))); - assert(!cudaMallocManaged(&g2_aff_points, n * sizeof(g2_affine))); - assert(!cudaMallocManaged(&res_points1, n * sizeof(proj))); - assert(!cudaMallocManaged(&res_points2, n * sizeof(proj))); - assert(!cudaMallocManaged(&g2_res_points1, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&g2_res_points2, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_field))); + assert(!cudaMallocManaged(&points1, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&points2, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&g2_points1, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&g2_points2, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&zero_points, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&g2_zero_points, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&aff_points, n * sizeof(affine_t))); + assert(!cudaMallocManaged(&g2_aff_points, n * sizeof(g2_affine_t))); + assert(!cudaMallocManaged(&res_points1, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&res_points2, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&g2_res_points1, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&g2_res_points2, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&res_scalars2, n * 
sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&res_scalars_wide, n * sizeof(scalar_field_t::Wide))); + assert(!cudaMallocManaged(&res_scalars_wide_full, n * sizeof(scalar_field_t::Wide))); } ~PrimitivesTest() override { @@ -84,28 +99,34 @@ protected: cudaFree(g2_res_points2); cudaFree(res_scalars1); cudaFree(res_scalars2); + + cudaFree(res_scalars_wide); + cudaFree(res_scalars_wide_full); + cudaDeviceReset(); } void SetUp() override { - ASSERT_EQ(device_populate_random(points1, n), cudaSuccess); - ASSERT_EQ(device_populate_random(points2, n), cudaSuccess); - ASSERT_EQ(device_populate_random(g2_points1, n), cudaSuccess); - ASSERT_EQ(device_populate_random(g2_points2, n), cudaSuccess); - ASSERT_EQ(device_populate_random(scalars1, n), cudaSuccess); - ASSERT_EQ(device_populate_random(scalars2, n), cudaSuccess); - ASSERT_EQ(device_set(zero_points, proj::zero(), n), cudaSuccess); - ASSERT_EQ(device_set(g2_zero_points, g2_proj::zero(), n), cudaSuccess); - ASSERT_EQ(device_set(zero_scalars, scalar_field::zero(), n), cudaSuccess); - ASSERT_EQ(device_set(one_scalars, scalar_field::one(), n), cudaSuccess); - ASSERT_EQ(cudaMemset(aff_points, 0, n * sizeof(affine)), cudaSuccess); - ASSERT_EQ(cudaMemset(g2_aff_points, 0, n * sizeof(g2_affine)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_points1, 0, n * sizeof(proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(g2_res_points1, 0, n * sizeof(g2_proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(g2_res_points2, 0, n * sizeof(g2_proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field)), cudaSuccess); + ASSERT_EQ(device_populate_random(points1, n), cudaSuccess); + ASSERT_EQ(device_populate_random(points2, n), cudaSuccess); + ASSERT_EQ(device_populate_random(g2_points1, n), cudaSuccess); + ASSERT_EQ(device_populate_random(g2_points2, n), cudaSuccess); + ASSERT_EQ(device_populate_random(scalars1, n), cudaSuccess); + ASSERT_EQ(device_populate_random(scalars2, n), cudaSuccess); + ASSERT_EQ(device_set(zero_points, projective_t::zero(), n), cudaSuccess); + ASSERT_EQ(device_set(g2_zero_points, g2_projective_t::zero(), n), cudaSuccess); + ASSERT_EQ(device_set(zero_scalars, scalar_field_t::zero(), n), cudaSuccess); + ASSERT_EQ(device_set(one_scalars, scalar_field_t::one(), n), cudaSuccess); + ASSERT_EQ(cudaMemset(aff_points, 0, n * sizeof(affine_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(g2_aff_points, 0, n * sizeof(g2_affine_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_points1, 0, n * sizeof(projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(g2_res_points1, 0, n * sizeof(g2_projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(g2_res_points2, 0, n * sizeof(g2_projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars_wide, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars_wide_full, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess); } }; @@ -183,7 +204,7 @@ TEST_F(PrimitivesTest, FieldMultiplicationSqrEq) { TEST_F(PrimitivesTest, ECRandomPointsAreOnCurve) { for (unsigned i = 0; i < n; i++) - ASSERT_PRED1(proj::is_on_curve, points1[i]); + ASSERT_PRED1(projective_t::is_on_curve, points1[i]); } 
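// A condensed, host-only sketch of the reference-check pattern the multiprecision tests
// below build on, assuming the convert_to_boost_mp helper and scalar_field_t type already
// defined in this file; the function and parameter names are illustrative, not part of the
// test suite.
bool wide_product_matches(uint32_t *a_limbs, uint32_t *b_limbs, uint32_t *gpu_wide_limbs) {
  const uint32_t tlc = scalar_field_t::TLC;
  // Rebuild both operands and the GPU's 2*TLC-limb wide result as big integers, then
  // compare against the exact product computed on the host by boost::multiprecision.
  mp::int1024_t a = convert_to_boost_mp(a_limbs, tlc);
  mp::int1024_t b = convert_to_boost_mp(b_limbs, tlc);
  mp::int1024_t gpu = convert_to_boost_mp(gpu_wide_limbs, 2 * tlc);
  return gpu == a * b;
}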
TEST_F(PrimitivesTest, ECPointAdditionSubtractionCancel) { @@ -260,7 +281,7 @@ TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverAddition) { TEST_F(PrimitivesTest, ECProjectiveToAffine) { ASSERT_EQ(point_vec_to_affine(points1, aff_points, n), cudaSuccess); for (unsigned i = 0; i < n; i++) - ASSERT_EQ(points1[i], proj::from_affine(aff_points[i])); + ASSERT_EQ(points1[i], projective_t::from_affine(aff_points[i])); } TEST_F(PrimitivesTest, ECMixedPointAddition) { @@ -279,9 +300,192 @@ TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction) { ASSERT_EQ(res_points1[i], points1[i] + res_points2[i]); } +TEST_F(PrimitivesTest, MP_LSB_MULT) { + // LSB multiply, check correctness of first TLC + 1 digits result. + ASSERT_EQ(mp_lsb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess); + std::cout << "first GPU lsb mult output = 0x"; + for (int i=0; i<2*scalar_field_t::TLC; i++) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i]; + } + std::cout << std::endl; + + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "first GPU full mult output = 0x"; + for (int i=0; i<2*scalar_field_t::TLC; i++) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i]; + } + std::cout << std::endl; + for (int j = 0; j < n; j++) + { + for (int i=0; i=0 ; i--) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; + } + std::cout << std::endl; + + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "first GPU full mult output = 0x"; + for (int i=2*scalar_field_t::TLC - 1; i >=0 ; i--) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; + } + + std::cout << std::endl; + + for (int i=0; i < 2*scalar_field_t::TLC - 1; i++) + { + if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i]) + std::cout << "matched word idx = " << i << std::endl; + } + +} + +TEST_F(PrimitivesTest, INGO_MP_MULT) { + // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1. 
+ ASSERT_EQ(ingo_mp_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess); + std::cout << "INGO = 0x"; + for (int i=0; i < 2*scalar_field_t::TLC ; i++) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; + } + std::cout << std::endl; + + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "ZKSYNC = 0x"; + for (int i=0; i < 2*scalar_field_t::TLC ; i++) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; + } + + std::cout << std::endl; + + for (int i=0; i < 2*scalar_field_t::TLC - 1; i++) + { + if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i]) + std::cout << "matched word idx = " << i << std::endl; + } + for (int j=0; j= 0 ; i--) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; + } + std::cout << std::endl; + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "ZKSYNC = 0x"; + for (int i=2*scalar_field_t::TLC - 1; i >= 0 ; i--) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; + } + + std::cout << std::endl; + + + // for (int i=scalar_field::TLC; i < 2*scalar_field::TLC - 1; i++) + // { + // ASSERT_EQ(in_bound, true); + // } + // for (int j=0; j> (num_limbs * 32); + res_gpu = convert_to_boost_mp(&(res_scalars_wide[j]).limbs_storage.limbs[num_limbs], num_limbs); + std::cout << "res mp = " << res_mp << std::endl; + std::cout << "res gpu = " << res_gpu << std::endl; + std::cout << "error = " << res_mp - res_gpu << std::endl; + bool upper_bound = res_gpu <= res_mp; + bool lower_bound = res_gpu > (res_mp - num_limbs); + bool in_bound = upper_bound && lower_bound; + + + ASSERT_EQ(in_bound, true); + } +} + +TEST_F(PrimitivesTest, INGO_MP_MOD_MULT) { + std::cout << " taking num limbs " << std::endl; + uint32_t num_limbs = scalar_field_t::TLC; + std::cout << " calling gpu... = " << std::endl; + ASSERT_EQ(ingo_mp_mod_mult(scalars1, scalars2, res_scalars1, n), cudaSuccess); + std::cout << " gpu call done " << std::endl; + // mp testing + mp::int1024_t scalar_1_mp = 0; + mp::int1024_t scalar_2_mp = 0; + mp::int1024_t res_mp = 0; + mp::int1024_t res_gpu = 0; + mp::int1024_t p = convert_to_boost_mp(scalar_field_t::get_modulus().limbs, num_limbs); + std::cout << " p = " << p << std::endl; + + + for (int j=0; j scalar_field; -typedef Field base_field; -typedef Affine affine; -static constexpr base_field b = base_field{ weierstrass_b }; -typedef Projective proj; -typedef ExtensionField base_extension_field; -typedef Affine g2_affine; -static constexpr base_extension_field b2 = base_extension_field{ base_field {b_re}, base_field {b_im}}; -typedef Projective g2_proj; +#endif +using namespace BN254; template __global__ void add_elements_kernel(const T1 *x, const T2 *y, T1 *result, const unsigned count) { @@ -72,27 +68,27 @@ template int vec_mul(const F *x, const G *y, G *result, const return error ? 
error : cudaDeviceSynchronize(); } -__global__ void inv_field_elements_kernel(const scalar_field *x, scalar_field *result, const unsigned count) { +__global__ void inv_field_elements_kernel(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; if (gid >= count) return; - result[gid] = scalar_field::inverse(x[gid]); + result[gid] = scalar_field_t::inverse(x[gid]); } -int field_vec_inv(const scalar_field *x, scalar_field *result, const unsigned count) { +int field_vec_inv(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { inv_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count); int error = cudaGetLastError(); return error ? error : cudaDeviceSynchronize(); } -__global__ void sqr_field_elements_kernel(const scalar_field *x, scalar_field *result, const unsigned count) { +__global__ void sqr_field_elements_kernel(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; if (gid >= count) return; - result[gid] = scalar_field::sqr(x[gid]); + result[gid] = scalar_field_t::sqr(x[gid]); } -int field_vec_sqr(const scalar_field *x, scalar_field *result, const unsigned count) { +int field_vec_sqr(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { sqr_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count); int error = cudaGetLastError(); return error ? error : cudaDeviceSynchronize(); @@ -111,3 +107,87 @@ template int point_vec_to_affine(const P *x, A *result, const int error = cudaGetLastError(); return error ? error : cudaDeviceSynchronize(); } + + +__global__ void mp_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int mp_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + mp_mult_kernel<<<1, 32>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + + + +__global__ void mp_lsb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::multiply_lsb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int mp_lsb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + mp_lsb_mult_kernel<<<1, 32>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + +__global__ void mp_msb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::multiply_msb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int mp_msb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + mp_msb_mult_kernel<<<1, 1>>>(x, y, result); + int error = cudaGetLastError(); + return error ? 
error : cudaDeviceSynchronize(); +} + + +__global__ void ingo_mp_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::ingo_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int ingo_mp_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + ingo_mp_mult_kernel<<<1, 32>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + + +__global__ void ingo_mp_msb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::ingo_msb_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int ingo_mp_msb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result, const unsigned n) +{ + ingo_mp_msb_mult_kernel<<<1, n>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + + +__global__ void ingo_mp_mod_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + result[gid] = x[gid] * y[gid]; +} + + +int ingo_mp_mod_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t *result, const unsigned n) +{ + ingo_mp_mod_mult_kernel<<<1, n>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/icicle/utils/mont.cuh b/icicle/utils/mont.cuh new file mode 100644 index 000000000..a41071020 --- /dev/null +++ b/icicle/utils/mont.cuh @@ -0,0 +1,25 @@ +#pragma once + +#include "../appUtils/vector_manipulation/ve_mod_mult.cuh" + +template <typename E> +int convert_montgomery(E *d_inout, size_t n_elements, bool is_into, cudaStream_t stream) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n_elements + num_threads - 1) / num_threads; + E mont = is_into ? E::montgomery_r() : E::montgomery_r_inv(); + template_normalize_kernel<<<num_blocks, num_threads, 0, stream>>>(d_inout, n_elements, mont); + + return 0; // TODO: void with proper error handling +} + +template <typename E> +int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) { + return convert_montgomery(d_inout, n, true, stream); +} + +template <typename E> +int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream){ + return convert_montgomery(d_inout, n, false, stream); +} \ No newline at end of file
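A minimal host-side usage sketch of the to_montgomery/from_montgomery helpers above, assuming a device array of BN254 scalars (scalar_t with montgomery_r()/montgomery_r_inv(), as used elsewhere in this patch), the default stream, and an illustrative buffer size; error handling is omitted:

    // Round-trip a device buffer through Montgomery form.
    scalar_t *d_scalars;
    size_t n = 1 << 12;                                // illustrative size
    cudaMalloc(&d_scalars, n * sizeof(scalar_t));
    // ... fill d_scalars (e.g. copied from host or produced by a kernel) ...
    to_montgomery(d_scalars, n, 0);    // each element multiplied by R (montgomery_r)
    from_montgomery(d_scalars, n, 0);  // each element multiplied by R^-1 (montgomery_r_inv)
    cudaFree(d_scalars);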
diff --git a/src/test_bn254.rs b/src/test_bn254.rs index ef86e3156..7cce80e7e 100644 --- a/src/test_bn254.rs +++ b/src/test_bn254.rs @@ -73,6 +73,16 @@ extern "C" { device_id: usize ) -> c_int; + fn ntt_inplace_coset_batch_cuda_bn254( + d_inout: DevicePointer<ScalarField_BN254>, + d_twiddles: DevicePointer<ScalarField_BN254>, + n: usize, + batch_size: usize, + inverse: bool, + is_coset: bool, + d_coset: DevicePointer<ScalarField_BN254>, + device_id: usize) -> c_int; + fn interpolate_scalars_cuda_bn254( d_out: DevicePointer<ScalarField_BN254>, d_evaluations: DevicePointer<ScalarField_BN254>, @@ -651,6 +661,29 @@ pub fn evaluate_scalars_on_coset_batch_bn254( return res; } +//extern "C" int ntt_inplace_coset_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::scalar_t* d_twiddles, +// unsigned n, unsigned batch_size, bool inverse, bool is_coset, BN254::scalar_t* coset, size_t device_id = 0, cudaStream_t stream = 0) +pub fn ntt_inplace_coset_batch_bn254( + d_inout: &mut DeviceBuffer<ScalarField_BN254>, + d_twiddles: &mut DeviceBuffer<ScalarField_BN254>, + batch_size: usize, + inverse: bool, + d_coset: &mut DeviceBuffer<ScalarField_BN254>, +) -> i32 { + unsafe { + ntt_inplace_coset_batch_cuda_bn254( + d_inout.as_device_ptr(), + d_twiddles.as_device_ptr(), + d_twiddles.len(), + batch_size, + inverse, + d_coset.len() > 0, + d_coset.as_device_ptr(), + 0 + ) + } +} + pub fn evaluate_points_on_coset_bn254( d_coefficients: &mut DeviceBuffer<Point_BN254>, d_domain: &mut DeviceBuffer<ScalarField_BN254>, @@ -786,6 +819,22 @@ pub fn generate_random_points_bn254( .collect() } +pub fn generate_random_points100_bn254( + count: usize, + mut rng: Box<dyn RngCore>, +) -> Vec<PointAffineNoInfinity_BN254> { + let mut res = Vec::new(); + for i in 0..count{ + if (i<100) { + res.push(Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng)).to_xy_strip_z()); + } + else { + res.push(res[i-100]); + } + } + return res; +} + pub fn generate_random_points_proj_bn254(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point_BN254> { (0..count) .map(|_| Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng))) @@ -898,12 +947,13 @@ pub(crate) mod tests_bn254 { #[test] fn test_msm() { - let test_sizes = [6, 9]; + let test_sizes = [24]; for pow2 in test_sizes { let count = 1 << pow2; let seed = None; // set Some to provide seed - let points = generate_random_points_bn254(count, get_rng_bn254(seed)); + // let points = generate_random_points_bn254(count, get_rng_bn254(seed)); + let points = generate_random_points100_bn254(count, get_rng_bn254(seed)); let scalars = generate_random_scalars_bn254(count, get_rng_bn254(seed)); let msm_result = msm_bn254(&points, &scalars, 0); @@ -1413,13 +1463,19 @@ pub(crate) mod tests_bn254 { let (_, _, mut d_large_domain) = set_up_scalars_bn254(0, log_test_size + 1, false); let mut d_coset_powers = build_domain_bn254(test_size, log_test_size + 1, false); + println!("d_coset_powers len {}", d_coset_powers.len()); + let mut d_evals_large = evaluate_scalars_batch_bn254(&mut d_coeffs, &mut d_large_domain, batch_size); let mut h_evals_large: Vec<ScalarField_BN254> = (0..2 * test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect(); d_evals_large.copy_to(&mut h_evals_large[..]).unwrap(); let mut d_evals = evaluate_scalars_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size); let mut h_evals: Vec<ScalarField_BN254> =
(0..test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect(); d_evals.copy_to(&mut h_evals[..]).unwrap(); - let mut d_evals_coset = evaluate_scalars_on_coset_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers); + + // let mut d_evals_coset = evaluate_scalars_on_coset_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers); + ntt_inplace_coset_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size, false, &mut d_coset_powers); + let d_evals_coset = d_coeffs; + + let mut h_evals_coset: Vec<ScalarField_BN254> = (0..test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect(); d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap(); @@ -1499,7 +1555,7 @@ pub(crate) mod tests_bn254 { assert_eq!(intoo, expected); } - #[test] + //#[test] #[allow(non_snake_case)] fn test_vec_point_mul() { let dummy_one = Point_BN254 {