Skip to content

Commit

Permalink
Time conv learn CUDA optimization (neoml-lib#311)
Browse files Browse the repository at this point in the history
* Add time convolution training via temporary matrix

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Fix time conv learn add through temporary matrix

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Add more detailed comment

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Optimize trivial case in CUDA BlobTimeConvolutionLearnAdd

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Add corner case tests for TimeConvLearnAdd

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* A little bit more detailed comment

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Add missing ASSERT_EXPR

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

Co-authored-by: Stanislav Angeliuk <59917951+SAngeliuk@users.noreply.github.com>
  • Loading branch information
Valeriy Fedyunin and SAngeliuk authored Apr 23, 2021
1 parent 4d59fc6 commit 412614f
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 6 deletions.
51 changes: 46 additions & 5 deletions NeoMathEngine/src/GPU/CUDA/CudaMathEngineDnnTimeConv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,52 @@ void CCudaMathEngine::BlobTimeConvolutionLearnAdd( const CTimeConvolutionDesc& c
const CCudaBlobDesc& outputDiff = desc.Result;

// Train the filter
int blockCount;
int threadCount;
getCudaTaskGrid( blockCount, threadCount, desc.Filter.BlobSize() );
blobTimeConvolutionLearnFilterKernel<<<blockCount, threadCount>>>( desc, GetRaw( inputData ),
GetRaw( outputDiffData ), GetRaw( filterDiffData ) );
if( filterDiff.Height() == 1 && desc.Stride == 1 ) {
// This assert has already been checked in InitTimeConvolution
ASSERT_EXPR( desc.PaddingFront == 0 && desc.PaddingBack == 0 );
// Trivial case
MultiplyTransposedMatrixByMatrixAndAdd( outputDiffData, desc.Source.ObjectCount(),
outputDiff.ObjectSize(), outputDiff.ObjectSize(), inputData, desc.Source.ObjectSize(),
desc.Source.ObjectSize(), filterDiffData, filterDiff.ObjectSize(), filterDiff.BlobSize() );
} else {
// Try to build a temporary matrix and process it part by part
const int tempMatrixWidth = filterDiff.ObjectSize();
const int tempMatrixHeight = outputDiff.BlobSize() / filterDiff.ObjectCount();
// Max amount of memory allowed is a half of math engine's free memory
const int maxInMemoryHeight = min( static_cast<int>( GetFreeMemorySize() / 2 / ( sizeof( float ) * tempMatrixWidth ) ),
tempMatrixHeight );

if( maxInMemoryHeight == 0 ) {
// Naive implementation which doesn't use additional memory
int blockCount;
int threadCount;
getCudaTaskGrid( blockCount, threadCount, desc.Filter.BlobSize() );
BlobTimeConvolutionLearnFilterKernel<<<blockCount, threadCount>>>( desc, GetRaw( inputData ),
GetRaw( outputDiffData ), GetRaw( filterDiffData ) );
} else {
int matrixRowIndex = 0;
CFloatHandle currOutputDiff = outputDiffData;
CFloatHandleStackVar tempMatrixPart( mathEngine(), maxInMemoryHeight * tempMatrixWidth );
const int filterCount = desc.Result.ObjectSize();

// Build temp matrix part by part and add filterDiff of that part
while( matrixRowIndex < tempMatrixHeight ) {
const int currPartHeight = min( tempMatrixHeight - matrixRowIndex, maxInMemoryHeight );

dim3 blockCount;
dim3 threadCount;
getCudaTaskGrid2D( blockCount, threadCount, currPartHeight, tempMatrixWidth );

BuildTempMatrixKernel<<<blockCount, threadCount>>>( desc, GetRaw( inputData ), currPartHeight,
tempMatrixWidth, GetRaw( tempMatrixPart.GetHandle() ), matrixRowIndex );
MultiplyTransposedMatrixByMatrixAndAdd( currOutputDiff, currPartHeight, filterCount, filterCount,
tempMatrixPart.GetHandle(), tempMatrixWidth, tempMatrixWidth, filterDiffData, tempMatrixWidth, filterDiff.BlobSize() );

matrixRowIndex += currPartHeight;
currOutputDiff += currPartHeight * filterCount;
}
}
}

// Train the free term
SumMatrixRowsAdd( 1, freeTermDiffData, outputDiffData, outputDiff.ObjectCount(), filterDiff.ObjectCount() );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ __global__ void BlobTimeConvolutionBackwardUnpackKernel( const CCudaTimeConvolut
}
}

__global__ void blobTimeConvolutionLearnFilterKernel( CCudaTimeConvolutionDescInternal desc,
__global__ void BlobTimeConvolutionLearnFilterKernel( CCudaTimeConvolutionDescInternal desc,
const float* __restrict__ input, const float* __restrict__ outputDiff, float* filterDiff )
{
const int objectSize = desc.Filter.Channels();
Expand Down
12 changes: 12 additions & 0 deletions NeoMathEngine/test/src/learn/BlobTimeConvolutionLearnAddTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,18 @@ INSTANTIATE_TEST_CASE_P( CBlobTimeConvolutionLearnAddTestInstantiation, CBlobTim
"PaddingBack = (0 .. 3);"
"Dilation = (1 .. 5);"
"TestCount = 30"
),
CTestParams(
"BatchLength = 3;"
"BatchSize = 15;"
"ObjectSize = 3;"
"FilterSize = 1;"
"FilterCount = 7;"
"Stride = 1;"
"PaddingFront = 2;"
"PaddingBack = 3;"
"Dilation = 1;"
"TestCount = 1"
)
)
);
Expand Down

0 comments on commit 412614f

Please sign in to comment.