Skip to content

Commit 532eab1

Browse files
V, Varshavarshavenkit
andauthored
Bug Fixes in LPGEMM for AVX512(SkyLake) machine (#24)
* Bug Fixes in LPGEMM for AVX512(SkyLake) machine - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that doesn't support BF16 instructions, the BF16 input is unre-ordered and converted to FP32 to use FP32 kernels. - For n = 1 and k = 1 sized matrices, re-ordering in BF16 is copying the matrix to the re-ordered buffer array. But the un-reordering to FP32 requires the matrix to have size multiple of 16 along n and multiple of 2 along k dimension. - The entry condition to the above has been modified for AVX512 configuration. - In bf16 API, the tiny path entry check has been modified to prevent seg fault while AOCL_ENABLE_INSTRUCTIONS=AVX2 is set in BF16 supporting machines. - Modified existing store instructions in FP32 AVX512 kernels to support execution in machines that has AVX512 support but not BF16/VNNI(SkyLake). - Added Bf16 beta and store types in FP32 avx512_256 kernels AMD Internal: [SWLCSG-3552] * Bug Fixes in LPGEMM for AVX512(SkyLake) machine - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that doesn't support BF16 instructions, the BF16 input is unre-ordered and converted to FP32 to use FP32 kernels. - For n = 1 and k = 1 sized matrices, re-ordering in BF16 is copying the matrix to the re-ordered buffer array. But the un-reordering to FP32 requires the matrix to have size multiple of 16 along n and multiple of 2 along k dimension. - The entry condition to the above has been modified for AVX512 configuration. - In bf16 API, the tiny path entry check has been modified to prevent seg fault while AOCL_ENABLE_INSTRUCTIONS=AVX2 is set in BF16 supporting machines. - Modified existing store instructions in FP32 AVX512 kernels to support execution in machines that has AVX512 support but not BF16/VNNI(SkyLake). - Added Bf16 beta and store types, along with BIAS and ZP in FP32 avx512_256 kernels AMD Internal: [SWLCSG-3552] * Bug Fixes in LPGEMM for AVX512(SkyLake) machine - Support added in FP32 512_256 kerenls for : Beta, BIAS, Zero-point and BF16 store types for bf16bf16f32obf16 API execution in AVX2 mode. - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that doesn't support BF16 instructions, the BF16 input is unre-ordered and converted to FP32 type to use FP32 kernels. - For n = 1 and k = 1 sized matrices, re-ordering in BF16 is copying the matrix to the re-ordered buffer array. But the un-reordering to FP32 requires the matrix to have size multiple of 16 along n and multiple of 2 along k dimension. The entry condition here has been modified for AVX512 configuration. - Fix for seg fault with AOCL_ENABLE_INSTRUCTIONS=AVX2 mode in BF16/VNNI ISA supporting configruations: - BF16 tiny path entry check has been modified to take into account arch_id to ensure improper entry into the tiny kernel. - The store in BF16->FP32 col-major for m = 1 conditions were updated to correct storage pattern, - BF16 beta load macro was modified to account for data in unaligned memory. - Modified existing store instructions in FP32 AVX512 kernels to support execution in machines that has AVX512 support but not BF16/VNNI(SkyLake) AMD Internal: [SWLCSG-3552] --------- Co-authored-by: VarshaV <[email protected]>
1 parent 62d4fcb commit 532eab1

12 files changed

+1403
-751
lines changed

addon/aocl_gemm/aocl_gemm_bf16_utils.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32_reference)
100100
}
101101

102102
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
103-
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
103+
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
104104
{
105105
if( rs_b == 1 )
106106
{
@@ -260,7 +260,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
260260
/*It is expected that while bf16 input is passed to AVX2 kernels,
261261
the unreorder/conversion of bf16->f32 is done, which expects the
262262
reordered matrix to be padded with n multiple of 16, k multiple of 2. */
263-
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
263+
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
264264
{
265265
n_reorder = 1;
266266
}
@@ -271,7 +271,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
271271

272272
// Extra space since packing does length in multiples of 2.
273273
dim_t k_reorder;
274-
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
274+
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
275275
{
276276
k_reorder = k;
277277
}
@@ -342,7 +342,6 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
342342
"cannot perform bf16bf16f32/f32f32f32 gemm.", __FILE__, __LINE__ );
343343
return; // Error.
344344
}
345-
346345
aocl_reorder_bf16bf16f32of32_reference( order,trans ,mat_type, input_buf_addr,
347346
reorder_buf_addr, k, n, ldb );
348347

@@ -755,5 +754,4 @@ AOCL_GEMM_REORDER(int8_t, bf16s4f32of32)
755754
b.mat_type = input_mat_type;
756755

757756
reorderb_nr64_bf16s4f32of32(&b, &b_reorder, &rntm_g, lcntx_g);
758-
}
759-
757+
}

addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
229229
bli_pba_rntm_set_pba( &rntm_g );
230230

231231
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
232-
233232
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
234-
235-
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( is_single_thread( &rntm_g ) == TRUE) )
233+
/* While AOCL_ENABLE_INSTRUCTIONS=AVX2 is enabled in machines that supports BF16/VNNI
234+
* with only the ISA check the exeution could enter tiny path and result in seg fault
235+
* as the tiny path for BF16->FP32 is not available. Hence the arch_id also has to be
236+
* verified here.
237+
*/
238+
arch_t arch_id = bli_arch_query_id();
239+
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) && ( is_single_thread( &rntm_g ) == TRUE) )
236240
{
237241
if( ( is_row_major == TRUE ) &&
238242
( is_tiny_input_bf16obf16( m, n, k, lcntx_g ) == TRUE ) )
@@ -326,4 +330,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
326330

327331
err_hndl:;
328332
LPGEMM_STOP_LOGGER();
329-
}
333+
}

addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,13 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
235235
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
236236

237237
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
238-
239-
if ( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) &&
238+
/* While AOCL_ENABLE_INSTRUCTIONS=AVX2 is enabled in machines that supports BF16/VNNI
239+
* with only the ISA check the exeution could enter tiny path and result in seg fault
240+
* as the tiny path for BF16->FP32 is not available. Hence the arch_id also has to be
241+
* verified here.
242+
*/
243+
arch_t arch_id = bli_arch_query_id();
244+
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) &&
240245
( is_tiny_input_bf16of32( m, n, k, lcntx_g ) == TRUE ) &&
241246
( is_single_thread( &rntm_g ) == TRUE) &&
242247
( is_row_major == TRUE ) )
@@ -315,4 +320,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
315320

316321
err_hndl:;
317322
LPGEMM_STOP_LOGGER();
318-
}
323+
}

addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
799799

800800
dim_t ic_start, ic_end;
801801
bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
802+
802803
for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
803804
{
804805
dim_t nc0 = bli_min( ( jc_end - jc ), NC );
@@ -898,7 +899,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
898899
bli_thread_ocomm_id( &thread_ic ),
899900
&thread->comm[jc_work_id]
900901
);
901-
902902
if ( mtag_b == PACK )
903903
{
904904
cvt_b_buffer_bf16_f32 =
@@ -914,6 +914,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
914914
&thread_ic, nc0, NR, FALSE,
915915
&jc_packb_start, &jc_packb_end
916916
);
917+
917918
// Ensure thread ranges are valid, especially cases where no:
918919
// of threads available for parallelization are greater than
919920
// no: of B panel NR chunks.
@@ -1014,7 +1015,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
10141015
mem_a_size_req, BLIS_BUFFER_FOR_GEN_USE,
10151016
&mem_a, rntm
10161017
);
1017-
10181018
// For packed or unpacked A matrix, the mc0 * kc0 block is
10191019
//converted to F32, i.e., packing has to be done by default
10201020
cvt_a_buffer_bf16_f32 =

kernels/zen/lpgemm/f32f32f32/lpgemm_cvt_pack_bf16_f32_axv2.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -669,7 +669,7 @@ void cvt_bf16_f32_col_major
669669
SHUFFLE_8x8_AVX2
670670
PERMUTE_8x8_AVX2
671671
GET_STORE_MASK(4, store_mask);
672-
MASKED_STORE_2COLS_AVX2(store_mask);
672+
_mm256_maskstore_ps( ( cvt_buffer + ( ( ic + 0 ) * rs_p ) + kr ), store_mask, b_reg[0] ); \
673673
}
674674
for( ; ( kr + 1 ) < KC; kr += 2 )
675675
{
@@ -683,7 +683,7 @@ void cvt_bf16_f32_col_major
683683
SHUFFLE_8x8_AVX2
684684
PERMUTE_8x8_AVX2
685685
GET_STORE_MASK(2, store_mask);
686-
MASKED_STORE_2COLS_AVX2(store_mask);
686+
_mm256_maskstore_ps( ( cvt_buffer + ( ( ic + 0 ) * rs_p ) + kr ), store_mask, b_reg[0] );
687687
}
688688
for( ; kr < KC; kr += 1 )
689689
{
@@ -695,7 +695,7 @@ void cvt_bf16_f32_col_major
695695
SHUFFLE_8x8_AVX2
696696
PERMUTE_8x8_AVX2
697697
GET_STORE_MASK(1, store_mask);
698-
MASKED_STORE_2COLS_AVX2(store_mask);
698+
_mm256_maskstore_ps( ( cvt_buffer + ( ( ic + 0 ) * rs_p ) + kr ), store_mask, b_reg[0] ); \
699699
}
700700
}
701701
}

kernels/zen/lpgemm/f32f32f32/lpgemm_kernel_macros_f32_avx2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ multiply with Beta, and add to alpha*A*B*/
195195
( \
196196
_mm256_cvtepi16_epi32 \
197197
( \
198-
_mm_load_si128 \
198+
_mm_loadu_si128 \
199199
( \
200200
( __m128i const* )( \
201201
( bfloat16* )post_ops_attr.buf_downscale + \

kernels/zen/lpgemm/f32f32f32/lpgemm_m_kernel_f32_avx2.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,6 @@ LPGEMM_MAIN_KERN(float,float,float,f32f32f32of32_6x16m)
7676
&&POST_OPS_SIGMOID_6x16F
7777
};
7878

79-
80-
81-
82-
83-
84-
8579
uint64_t n_left = n0 % NR; //n0 is expected to be n0<=NR
8680
// First check whether this is a edge case in the n dimension.
8781
// If so, dispatch other 6x?m kernels, as needed.

0 commit comments

Comments
 (0)