Optimize IMathEngine::TransposeMatrix (neoml-lib#655)

* Copy data instead of transpose (where possible)

  Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Optimize CPU transpose even further

  Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Reduce diff size

  Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>
zimka · Jun 16, 2022 · 662b1a9 · 662b1a9
1 parent 557cf5b
commit 662b1a9
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 0 deletions.
diff --git a/NeoMathEngine/src/CPU/CpuMathEngineBlas.cpp b/NeoMathEngine/src/CPU/CpuMathEngineBlas.cpp
@@ -106,6 +106,22 @@ template<class T>
 inline void CCpuMathEngine::transposeMatrixImpl( int batchSize, const T* first,
 	int height, int medium, int width, int channels, T* result )
 {
+	// Transpose B x 1 x M x W x C -> B x W x M x 1 x C
+	// is equivalent to B x M x 1 x W x C -> B x W x 1 x M x C
+	if( medium != 1 && height == 1 ) {
+		swap( medium, height );
+	}
+
+	// Same goes for W == 1 && H != 1
+	if( medium != 1 && width == 1 ) {
+		swap( medium, width );
+	}
+
+	if( medium == 1 && ( height == 1 || width == 1 ) ) {
+		dataCopy( result, first, batchSize * height * medium * width * channels );
+		return;
+	}
+
 	if( medium == 1 && channels == 1 ) {
 		static_assert( sizeof(float) == sizeof(T), "Size of float isn't equal to size of T." );
 		batchTransposePlainMatrix( batchSize, reinterpret_cast<const float*>( first ),

diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngineBlas.cu b/NeoMathEngine/src/GPU/CUDA/CudaMathEngineBlas.cu
@@ -833,6 +833,12 @@ void CCudaMathEngine::transposeMatrixImpl(int batchSize, const CTypedMemoryHandl
 {
 	int size = batchSize * height * medium * width * channels;
 	ASSERT_EXPR(resultBufferSize >= size);
+
+	if( medium == 1 && ( height == 1 || width == 1 ) ) {
+		VectorCopy( resultHandle, firstHandle, size );
+		return;
+	}
+
 	SetCudaDevice( device->DeviceNumber );
 
 	int blockCount;

diff --git a/NeoMathEngine/src/GPU/Metal/MetalMathEngineBlas.mm b/NeoMathEngine/src/GPU/Metal/MetalMathEngineBlas.mm
@@ -966,6 +966,11 @@ C2DKernel kernel( *queue, "matrixKernelMultiplyMatrixByTransposedMatrixThread4x4
     ASSERT_EXPR( firstHandle.GetMathEngine() == this );
 	ASSERT_EXPR( resultHandle.GetMathEngine() == this );
 
+	if( medium == 1 && ( height == 1 || width == 1 ) ) {
+		VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
+		return;
+	}
+
     const int size = batchSize * height * medium * width * channels;
     ASSERT_EXPR( resultBufferSize >= size );
 
@@ -987,6 +992,11 @@ C2DKernel kernel( *queue, "matrixKernelMultiplyMatrixByTransposedMatrixThread4x4
     ASSERT_EXPR( firstHandle.GetMathEngine() == this );
 	ASSERT_EXPR( resultHandle.GetMathEngine() == this );
 
+	if( medium == 1 && ( height == 1 || width == 1 ) ) {
+		VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
+		return;
+	}
+
     const int size = batchSize * height * medium * width * channels;
     ASSERT_EXPR( resultBufferSize >= size );
 

diff --git a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineBlas.cpp b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineBlas.cpp
@@ -109,6 +109,11 @@ void CVulkanMathEngine::AddDiagMatrixToMatrix( const CConstFloatHandle& diagMatr
 void CVulkanMathEngine::TransposeMatrix( int batchSize, const CConstFloatHandle& firstHandle,
 	int height, int medium, int width, int channels, const CFloatHandle& resultHandle, int /*resultBufferSize*/ )
 {
+	if( medium == 1 && ( height == 1 || width == 1 ) ) {
+		VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
+		return;
+	}
+
 	int vectorSize = batchSize * height * medium * width * channels;
 	CMemoryHandle bufs[2] = { firstHandle, resultHandle };
 	size_t sizes[2] = { vectorSize * sizeof(float), vectorSize * sizeof(float) };
@@ -122,6 +127,11 @@ void CVulkanMathEngine::TransposeMatrix( int batchSize, const CConstFloatHandle&
 void CVulkanMathEngine::TransposeMatrix( int batchSize, const CConstIntHandle& firstHandle,
 	int height, int medium, int width, int channels, const CIntHandle& resultHandle, int /*resultBufferSize*/ )
 {
+	if( medium == 1 && ( height == 1 || width == 1 ) ) {
+		VectorCopy( resultHandle, firstHandle, batchSize * height * medium * width * channels );
+		return;
+	}
+
 	int vectorSize = batchSize * height * medium * width * channels;
 	CMemoryHandle bufs[2] = { firstHandle, resultHandle };
 	size_t sizes[2] = { vectorSize * sizeof(int), vectorSize * sizeof(int) };