Skip to content

Commit

Permalink
Optimize IMathEngine::TransposeMatrix (neoml-lib#655)
Browse files Browse the repository at this point in the history
* Copy data instead of transposing (where possible)

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Optimize CPU transpose even further

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Reduce diff size

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>
  • Loading branch information
Valeriy Fedyunin authored Jun 16, 2022
1 parent 557cf5b commit 662b1a9
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 0 deletions.
16 changes: 16 additions & 0 deletions NeoMathEngine/src/CPU/CpuMathEngineBlas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,22 @@ template<class T>
inline void CCpuMathEngine::transposeMatrixImpl( int batchSize, const T* first,
int height, int medium, int width, int channels, T* result )
{
// Transpose B x 1 x M x W x C -> B x W x M x 1 x C
// is equivalent to B x M x 1 x W x C -> B x W x 1 x M x C
if( medium != 1 && height == 1 ) {
swap( medium, height );
}

// Same goes for W == 1 && H != 1
if( medium != 1 && width == 1 ) {
swap( medium, width );
}

if( medium == 1 && ( height == 1 || width == 1 ) ) {
dataCopy( result, first, batchSize * height * medium * width * channels );
return;
}

if( medium == 1 && channels == 1 ) {
static_assert( sizeof(float) == sizeof(T), "Size of float isn't equal to size of T." );
batchTransposePlainMatrix( batchSize, reinterpret_cast<const float*>( first ),
Expand Down
6 changes: 6 additions & 0 deletions NeoMathEngine/src/GPU/CUDA/CudaMathEngineBlas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,12 @@ void CCudaMathEngine::transposeMatrixImpl(int batchSize, const CTypedMemoryHandl
{
int size = batchSize * height * medium * width * channels;
ASSERT_EXPR(resultBufferSize >= size);

if( medium == 1 && ( height == 1 || width == 1 ) ) {
VectorCopy( resultHandle, firstHandle, size );
return;
}

SetCudaDevice( device->DeviceNumber );

int blockCount;
Expand Down
10 changes: 10 additions & 0 deletions NeoMathEngine/src/GPU/Metal/MetalMathEngineBlas.mm
Original file line number Diff line number Diff line change
Expand Up @@ -966,6 +966,11 @@ C2DKernel kernel( *queue, "matrixKernelMultiplyMatrixByTransposedMatrixThread4x4
ASSERT_EXPR( firstHandle.GetMathEngine() == this );
ASSERT_EXPR( resultHandle.GetMathEngine() == this );

if( medium == 1 && ( height == 1 || width == 1 ) ) {
VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
return;
}

const int size = batchSize * height * medium * width * channels;
ASSERT_EXPR( resultBufferSize >= size );

Expand All @@ -987,6 +992,11 @@ C2DKernel kernel( *queue, "matrixKernelMultiplyMatrixByTransposedMatrixThread4x4
ASSERT_EXPR( firstHandle.GetMathEngine() == this );
ASSERT_EXPR( resultHandle.GetMathEngine() == this );

if( medium == 1 && ( height == 1 || width == 1 ) ) {
VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
return;
}

const int size = batchSize * height * medium * width * channels;
ASSERT_EXPR( resultBufferSize >= size );

Expand Down
10 changes: 10 additions & 0 deletions NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineBlas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ void CVulkanMathEngine::AddDiagMatrixToMatrix( const CConstFloatHandle& diagMatr
void CVulkanMathEngine::TransposeMatrix( int batchSize, const CConstFloatHandle& firstHandle,
int height, int medium, int width, int channels, const CFloatHandle& resultHandle, int /*resultBufferSize*/ )
{
if( medium == 1 && ( height == 1 || width == 1 ) ) {
VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
return;
}

int vectorSize = batchSize * height * medium * width * channels;
CMemoryHandle bufs[2] = { firstHandle, resultHandle };
size_t sizes[2] = { vectorSize * sizeof(float), vectorSize * sizeof(float) };
Expand All @@ -122,6 +127,11 @@ void CVulkanMathEngine::TransposeMatrix( int batchSize, const CConstFloatHandle&
void CVulkanMathEngine::TransposeMatrix( int batchSize, const CConstIntHandle& firstHandle,
int height, int medium, int width, int channels, const CIntHandle& resultHandle, int /*resultBufferSize*/ )
{
if( medium == 1 && ( height == 1 || width == 1 ) ) {
VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
return;
}

int vectorSize = batchSize * height * medium * width * channels;
CMemoryHandle bufs[2] = { firstHandle, resultHandle };
size_t sizes[2] = { vectorSize * sizeof(int), vectorSize * sizeof(int) };
Expand Down

0 comments on commit 662b1a9

Please sign in to comment.