Skip to content

Commit

Permalink
LPGEMM s32 micro-kernel updates to fix gcc10.2 compilation issue.
Browse files Browse the repository at this point in the history
Some AVX512 intrinsics(eg: _mm_loadu_epi8) were introduced in later
versions of gcc (11+) in addition to already existing masked intrinsic
(eg: _mm_mask_loadu_epi8). In order to support compilation using gcc
10.2, either the masked intrinsic or other gcc 10.2 compatible intrinsic
needs to be used (eg: _mm_loadu_si128) in LPGEMM <u|s>8s8os32 kernels.

AMD-Internal: [SWLCSG-2542]
Change-Id: I6cfedfdcb28711b19df63d162ab267f5eea8d2ef
  • Loading branch information
MithunMohanKadavil authored and BhaskarNallani committed Nov 24, 2023
1 parent 126a070 commit 4e493d3
Show file tree
Hide file tree
Showing 10 changed files with 371 additions and 247 deletions.
3 changes: 2 additions & 1 deletion kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@

// Disable BF16 kernel in cases where compilers support other avx 512
// features except BF16 ISA.
#if defined( BLIS_GCC ) && ( __GNUC__ < 10 )
#if ( defined( BLIS_GCC ) && ( ( __GNUC__ < 11 ) || \
( ( __GNUC__ == 11 ) && ( __GNUC_MINOR__ < 2 ) ) ) )
#define LPGEMM_BF16_NOT_SUPPORTED
#endif

Expand Down
20 changes: 12 additions & 8 deletions kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c
Original file line number Diff line number Diff line change
Expand Up @@ -1062,17 +1062,21 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64)

// int8_t zero point value.
__m128i zero_point0 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) ) );
__m128i zero_point1 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) ) );
__m128i zero_point2 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) ) );
__m128i zero_point3 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) ) );

// c[0, 0-15]
CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0);
Expand Down
100 changes: 60 additions & 40 deletions kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c
Original file line number Diff line number Diff line change
Expand Up @@ -826,17 +826,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64)
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
__m128i zero_point0 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) ) );
__m128i zero_point1 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) ) );
__m128i zero_point2 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) ) );
__m128i zero_point3 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) ) );

// c[0, 0-15]
CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0);
Expand Down Expand Up @@ -1697,17 +1701,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64)
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
__m128i zero_point0 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) ) );
__m128i zero_point1 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) ) );
__m128i zero_point2 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) ) );
__m128i zero_point3 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) ) );

// c[0, 0-15]
CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0);
Expand Down Expand Up @@ -2406,17 +2414,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64)
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
__m128i zero_point0 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) ) );
__m128i zero_point1 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) ) );
__m128i zero_point2 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) ) );
__m128i zero_point3 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) ) );

// c[0, 0-15]
CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0);
Expand Down Expand Up @@ -2955,17 +2967,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64)
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
__m128i zero_point0 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) ) );
__m128i zero_point1 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) ) );
__m128i zero_point2 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) ) );
__m128i zero_point3 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) ) );

// c[0, 0-15]
CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0);
Expand Down Expand Up @@ -3341,17 +3357,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64)
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
__m128i zero_point0 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 0 * 16 ) ) );
__m128i zero_point1 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 1 * 16 ) ) );
__m128i zero_point2 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 2 * 16 ) ) );
__m128i zero_point3 =
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
_mm_loadu_si128( ( __m128i const* )
( ( int8_t* )post_ops_list_temp->op_args1 +
post_ops_attr.post_op_c_j + ( 3 * 16 ) ) );

// c[0, 0-15]
CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0);
Expand Down
Loading

0 comments on commit 4e493d3

Please sign in to comment.