Skip to content

Commit

Permalink
Time conv learn CUDA optimization (neoml-lib#311)
Browse files Browse the repository at this point in the history
* Add time convolution training via temporary matrix

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Fix time conv learn add through temporary matrix

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Add more detailed comment

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Optimize trivial case in CUDA BlobTimeConvolutionLearnAdd

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Add corner case tests for TimeConvLearnAdd

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* A little bit more detailed comment

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Add missing ASSERT_EXPR

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

Co-authored-by: Stanislav Angeliuk <59917951+SAngeliuk@users.noreply.github.com>
  • Loading branch information
Valeriy Fedyunin and SAngeliuk authored Apr 23, 2021
1 parent 4d59fc6 commit 412614f
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 6 deletions.
51 changes: 46 additions & 5 deletions NeoMathEngine/src/GPU/CUDA/CudaMathEngineDnnTimeConv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,52 @@ void CCudaMathEngine::BlobTimeConvolutionLearnAdd( const CTimeConvolutionDesc& c
const CCudaBlobDesc& outputDiff = desc.Result;

// Train the filter
int blockCount;
int threadCount;
getCudaTaskGrid( blockCount, threadCount, desc.Filter.BlobSize() );
blobTimeConvolutionLearnFilterKernel<<<blockCount, threadCount>>>( desc, GetRaw( inputData ),
GetRaw( outputDiffData ), GetRaw( filterDiffData ) );
if( filterDiff.Height() == 1 && desc.Stride == 1 ) {
// This assert has already been checked in InitTimeConvolution
ASSERT_EXPR( desc.PaddingFront == 0 && desc.PaddingBack == 0 );
// Trivial case
MultiplyTransposedMatrixByMatrixAndAdd( outputDiffData, desc.Source.ObjectCount(),
outputDiff.ObjectSize(), outputDiff.ObjectSize(), inputData, desc.Source.ObjectSize(),
desc.Source.ObjectSize(), filterDiffData, filterDiff.ObjectSize(), filterDiff.BlobSize() );
} else {
// Try to build a temporary matrix and process it part by part
const int tempMatrixWidth = filterDiff.ObjectSize();
const int tempMatrixHeight = outputDiff.BlobSize() / filterDiff.ObjectCount();
// Max amount of memory allowed is a half of math engine's free memory
const int maxInMemoryHeight = min( static_cast<int>( GetFreeMemorySize() / 2 / ( sizeof( float ) * tempMatrixWidth ) ),
tempMatrixHeight );

if( maxInMemoryHeight == 0 ) {
// Naive implementation which doesn't use additional memory
int blockCount;
int threadCount;
getCudaTaskGrid( blockCount, threadCount, desc.Filter.BlobSize() );
BlobTimeConvolutionLearnFilterKernel<<<blockCount, threadCount>>>( desc, GetRaw( inputData ),
GetRaw( outputDiffData ), GetRaw( filterDiffData ) );
} else {
int matrixRowIndex = 0;
CFloatHandle currOutputDiff = outputDiffData;
CFloatHandleStackVar tempMatrixPart( mathEngine(), maxInMemoryHeight * tempMatrixWidth );
const int filterCount = desc.Result.ObjectSize();

// Build temp matrix part by part and add filterDiff of that part
while( matrixRowIndex < tempMatrixHeight ) {
const int currPartHeight = min( tempMatrixHeight - matrixRowIndex, maxInMemoryHeight );

dim3 blockCount;
dim3 threadCount;
getCudaTaskGrid2D( blockCount, threadCount, currPartHeight, tempMatrixWidth );

BuildTempMatrixKernel<<<blockCount, threadCount>>>( desc, GetRaw( inputData ), currPartHeight,
tempMatrixWidth, GetRaw( tempMatrixPart.GetHandle() ), matrixRowIndex );
MultiplyTransposedMatrixByMatrixAndAdd( currOutputDiff, currPartHeight, filterCount, filterCount,
tempMatrixPart.GetHandle(), tempMatrixWidth, tempMatrixWidth, filterDiffData, tempMatrixWidth, filterDiff.BlobSize() );

matrixRowIndex += currPartHeight;
currOutputDiff += currPartHeight * filterCount;
}
}
}

// Train the free term
SumMatrixRowsAdd( 1, freeTermDiffData, outputDiffData, outputDiff.ObjectCount(), filterDiff.ObjectCount() );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ __global__ void BlobTimeConvolutionBackwardUnpackKernel( const CCudaTimeConvolut
}
}

__global__ void blobTimeConvolutionLearnFilterKernel( CCudaTimeConvolutionDescInternal desc,
__global__ void BlobTimeConvolutionLearnFilterKernel( CCudaTimeConvolutionDescInternal desc,
const float* __restrict__ input, const float* __restrict__ outputDiff, float* filterDiff )
{
const int objectSize = desc.Filter.Channels();
Expand Down
12 changes: 12 additions & 0 deletions NeoMathEngine/test/src/learn/BlobTimeConvolutionLearnAddTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,18 @@ INSTANTIATE_TEST_CASE_P( CBlobTimeConvolutionLearnAddTestInstantiation, CBlobTim
"PaddingBack = (0 .. 3);"
"Dilation = (1 .. 5);"
"TestCount = 30"
),
CTestParams(
"BatchLength = 3;"
"BatchSize = 15;"
"ObjectSize = 3;"
"FilterSize = 1;"
"FilterCount = 7;"
"Stride = 1;"
"PaddingFront = 2;"
"PaddingBack = 3;"
"Dilation = 1;"
"TestCount = 1"
)
)
);
Expand Down

0 comments on commit 412614f

Please sign in to comment.