[CudaMathEngine] Fix restrict modifier for function arguments (#1011)

Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>
Co-authored-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>
neoml-lib · Dec 21, 2023 · 3c37aed · 3c37aed
1 parent bc3db75
commit 3c37aed
Show file tree

Hide file tree

Showing 7 changed files with 22 additions and 22 deletions.
diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaBlasKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaBlasKernels.h
@@ -48,7 +48,7 @@ __global__ void AddVectorToMatrixElementsKernel( float* matrix, int height, int
 }
 
 const int AddVectorToMatrixElementsMulCombine = 4;
-__global__ void AddVectorToMatrixElementsKernel( float* matrix, int /*height*/, int width,
+__global__ void AddVectorToMatrixElementsKernel( float* __restrict__ matrix, int /*height*/, int width,
 	const int* __restrict__ rowIndices, const int* __restrict__ columnIndices,
 	const float* __restrict__ vector, int vectorSize )
 {
@@ -65,7 +65,7 @@ __global__ void AddVectorToMatrixElementsKernel( float* matrix, int /*height*/,
 // Assigns the values matrix[rowIndices[i], columnIndices[i]] = vector[i].
 const int SetVectorToMatrixElementsMulCombine = 4;
 __global__ void SetVectorToMatrixElementsKernel(
-	float* matrix, int /*height*/, int width,
+	float* __restrict__ matrix, int /*height*/, int width,
 	const int* __restrict__ rowIndices, const int* __restrict__ columnIndices,
 	const float* __restrict__ vector, int vectorSize )
 {
@@ -314,7 +314,7 @@ __global__ void MatrixLogSumExpByRowsKernel(const float* __restrict__ matrix, in
 }
 
 const int MatrixSoftmaxByRowsCombine = 2;
-__global__ void MatrixSoftmaxByRowsKernel(const float* __restrict__ matrix,
+__global__ void MatrixSoftmaxByRowsKernel(const float* matrix,
 	int height, int width, float* result, int widthNorm)
 {
 	extern __shared__  float buffer[];

diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnn3dPoolingKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnn3dPoolingKernels.h
@@ -23,7 +23,7 @@ limitations under the License.
 namespace NeoML {
 
 __global__ void Blob3dMaxPoolingKernel( const CCuda3dMaxPoolingDescInternal desc, const float* __restrict__ sourceData,
-	int* maxIndices, float* resultData )
+	int* __restrict__ maxIndices, float* __restrict__ resultData )
 {
 	const CCudaBlobDesc& result = desc.Result;
 	const CCudaBlobDesc& source = desc.Source;
@@ -83,8 +83,8 @@ __global__ void Blob3dMaxPoolingKernel( const CCuda3dMaxPoolingDescInternal desc
 	}
 }
 
-__global__ void Blob3dMaxPoolingBackwardKernel( const CCuda3dMaxPoolingDescInternal desc, const float* resultDiff,
-	const int* maxIndices, float* sourceDiff, bool isAtomic )
+__global__ void Blob3dMaxPoolingBackwardKernel( const CCuda3dMaxPoolingDescInternal desc, const float* __restrict__ resultDiff,
+	const int* __restrict__ maxIndices, float* __restrict__ sourceDiff, bool isAtomic )
 {
 	const CCudaBlobDesc& result = desc.Result;
 	const CCudaBlobDesc& source = desc.Source;

diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnCtcKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnCtcKernels.h
@@ -32,7 +32,7 @@ __global__ void CtcFillPaddingKernel( int maxSeqLen, int batchSize, int classCou
 
 const int CtcMatrixLogSumExpByColumnsCombine = 2;
 __global__ void CtcMatrixLogSumExpByColumnsKernel(int batchSize, const float* __restrict__ matrix, int height, int width,
-	float* result, int heightNorm)
+	float* __restrict__ result, int heightNorm)
 {
 	extern __shared__  float buffer[];
 	float& my = buffer[(threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];

diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnDropoutKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnDropoutKernels.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 namespace NeoML {
 
-__global__ void RandomMatrixDropout( const float* __restrict__ first, int firstHeight,
+__global__ void RandomMatrixDropout( const float* first, int firstHeight,
 	int firstWidth, float* res, int seed, float forwardRate )
 {
 	const unsigned int threshold = forwardRate * UINT_MAX;
@@ -39,7 +39,7 @@ __global__ void RandomMatrixDropout( const float* __restrict__ first, int firstH
 	}
 }
 
-__global__ void RandomSpatialDropout( const float* __restrict__ input, float* res, int inputObjectCount,
+__global__ void RandomSpatialDropout( const float* input, float* res, int inputObjectCount,
 	int inputObjectSize, int maskObjectCount, int maskObjectSize, int seed, float forwardRate )
 {
 	const unsigned int threshold = forwardRate * UINT_MAX;

diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnGlobalPoolingKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnGlobalPoolingKernels.h
@@ -417,7 +417,7 @@ __global__ void BlobGlobalMaxPoolingGlobalShuffleKernel( const CCudaGlobalMaxPoo
 
 const int BlobGlobalMaxPoolingBackwardCombine = 8;
 __global__ void BlobGlobalMaxPoolingBackwardKernel( const CCudaGlobalMaxPoolingDescInternal desc, const float* __restrict__ resultDiff,
-	const int* maxIndices, float* sourceDiff, int poolSize, int maxCount, int fullSize )
+	const int* __restrict__ maxIndices, float* __restrict__ sourceDiff, int poolSize, int maxCount, int fullSize )
 {
 	int index;
 	int step;

diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnGlobalTimePoolingKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaDnnGlobalTimePoolingKernels.h
@@ -80,7 +80,7 @@ __global__ void BlobGlobalMaxOverTimePoolingKernel( const CCudaGlobalMaxOverTime
 }
 
 __global__ void BlobGlobalMaxOverTimePoolingBackwardKernel( const CCudaGlobalMaxOverTimePoolingDescInternal desc,
-	const float* __restrict__ resultDiff, const int* __restrict__ maxIndicesData, float* sourceDiff )
+	const float* __restrict__ resultDiff, const int* __restrict__ maxIndicesData, float* __restrict__ sourceDiff )
 {
 	const CCudaBlobDesc& result = desc.Result;
 	int pos;

diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaVectorMathKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaVectorMathKernels.h
@@ -254,8 +254,8 @@ __global__ void VectorCumSumAlongDimensionDiagKernel( const float* __restrict__
 }
 
 const int VectorEqualCombineCount = 16;
-__global__ void VectorEqualKernel( const int* __restrict__ first,
-	const int* __restrict__ second, float* __restrict__ result, int count )
+__global__ void VectorEqualKernel( const int* first,
+	const int* second, float* result, int count )
 {
 	int index;
 	int step;
@@ -273,8 +273,8 @@ __global__ void VectorEqualKernel( const int* __restrict__ first,
 	}
 }
 
-__global__ void VectorEqualValueKernel( const int* __restrict__ first, 
-	float* __restrict__ result, int count, const int* __restrict__ value )
+__global__ void VectorEqualValueKernel( const int* first, 
+	float* result, int count, const int* __restrict__ value )
 {
 	int index;
 	int step;
@@ -346,7 +346,7 @@ __global__ void VectorELUDiffOpKernel( const float* __restrict__ first, const fl
 	}
 }
 
-__global__ void VectorReLUKernel(const float* __restrict__ first, float* result,
+__global__ void VectorReLUKernel(const float* first, float* result,
 	int count, const float* __restrict__ threshold)
 {
 	int index;
@@ -435,7 +435,7 @@ __global__ void VectorLeakyReLUDiffKernel( const float* __restrict__ first, cons
 	}
 }
 
-__global__ void VectorHSwishKernel( const float* __restrict__ first, float* result, int count )
+__global__ void VectorHSwishKernel( const float* first, float* result, int count )
 {
 	int index;
 	int step;
@@ -484,7 +484,7 @@ __global__ void VectorHSwishDiffKernel( const float* __restrict__ first, const f
 	}
 }
 const int VectorEltwiseMaxCombineCount = 8;
-__global__ void VectorEltwiseMaxKernel(const float* __restrict__ first, const float* __restrict__ second,
+__global__ void VectorEltwiseMaxKernel(const float* first, const float* second,
 	float* result, int count)
 {
 	int index;
@@ -506,7 +506,7 @@ __global__ void VectorEltwiseMaxKernel(const float* __restrict__ first, const fl
 }
 
 const int VectorEltwiseMinCombineCount = 8;
-__global__ void VectorEltwiseMinKernel(const float* __restrict__ first, const float* __restrict__ second,
+__global__ void VectorEltwiseMinKernel(const float* first, const float* second,
 	float* result, int count)
 {
 	int index;
@@ -527,7 +527,7 @@ __global__ void VectorEltwiseMinKernel(const float* __restrict__ first, const fl
 	}
 }
 
-__global__ void VectorAbsKernel(const float* __restrict__ first, float* result, int count)
+__global__ void VectorAbsKernel(const float* first, float* result, int count)
 {
 	int index;
 	int step;
@@ -1437,7 +1437,7 @@ __global__ void VectorLogDiffKernel( const float* __restrict__ sourceGrad,
 }
 
 const int VectorAbsDiffCombine = 16;
-__global__ void VectorAbsDiffKernel( const float* __restrict__ sourceGrad,
+__global__ void VectorAbsDiffKernel( const float* sourceGrad,
 	int gradCount, int gradSize, int gradNorm,
 	const float* __restrict__ first, float* resultGrad )
 {
@@ -1462,7 +1462,7 @@ __global__ void VectorAbsDiffKernel( const float* __restrict__ sourceGrad,
 }
 
 const int VectorMinMaxDiffCombine = 16;
-__global__ void VectorMinMaxDiffKernel( const float* __restrict__ sourceGrad,
+__global__ void VectorMinMaxDiffKernel( const float* sourceGrad,
 	int gradCount, int gradSize, int gradNorm,
 	const float* __restrict__ first, float* resultGrad,
 	const float* __restrict__ minPtr, const float* __restrict__ maxPtr )