Skip to content

Commit 55598a9

Browse files
committed
Added row-pref armv8a ukernel from #698.
Details:
- Integrated changes from PR #698 to enable testing in the context of the 'stable' branch. These changes add row-preferential sgemm and dgemm microkernels for the armv8a kernel set.
- Updated the 'altra' subconfig to easily switch between the previous (column-preferential) ukernel and the aforementioned row-pref ukernel.
1 parent 1232831 commit 55598a9

File tree

3 files changed

+662
-5
lines changed

3 files changed

+662
-5
lines changed

config/altra/bli_cntx_init_altra.c

+20-5
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434

3535
#include "blis.h"
3636

37+
//#define USE_ROWPREF_UKERNEL
38+
3739
void bli_cntx_init_altra( cntx_t* cntx )
3840
{
3941
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
@@ -48,19 +50,32 @@ void bli_cntx_init_altra( cntx_t* cntx )
4850
bli_cntx_set_l3_nat_ukrs
4951
(
5052
2,
53+
#ifdef USE_ROWPREF_UKERNEL
54+
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_12x8r, TRUE,
55+
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_8x6r, TRUE,
56+
#else
5157
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
5258
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
59+
#endif
5360
cntx
5461
);
5562

5663
// Initialize level-3 blocksize objects with architecture-specific values.
5764
// s d c z
58-
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 );
59-
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 );
60-
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 );
61-
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 ); // Changed d to 480 - LDR
62-
// bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 6144, -1, -1 ); // Doubled NC
65+
#ifdef USE_ROWPREF_UKERNEL
66+
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 12, 8, -1, -1 );
67+
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 6, -1, -1 );
68+
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 );
69+
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 );
70+
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 12288, 8196, -1, -1 );
71+
#else
72+
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 );
73+
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 );
74+
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 );
75+
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 ); // Changed d to 480 - LDR
76+
// bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 6144, -1, -1 ); // Doubled NC
6377
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 12288, 8192, -1, -1 ); // Increased NC slightly more
78+
#endif
6479

6580
// Update the context with the current architecture's register and cache
6681
// blocksizes (and multiples) for native execution.

kernels/armv8a/3/armv8a_asm_d2x2.h

+16
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,25 @@
4747
" fmla v"#C0".2d, v"#A".2d, v"#B".d[0] \n\t" \
4848
" fmla v"#C1".2d, v"#A".2d, v"#B".d[1] \n\t"
4949

50+
// Alias for DGEMM_2X2_NANOKERNEL: forwards (C0,C1,A,B) unchanged, so it emits
// the same accumulating (fmla) instruction pair. The _PLAIN suffix mirrors the
// _INIT variant so a caller can select either form by suffix
// (presumably via token pasting -- confirm against the kernels that use it).
#define DGEMM_2X2_NANOKERNEL_PLAIN(C0,C1,A,B) \
51+
DGEMM_2X2_NANOKERNEL(C0,C1,A,B)
52+
53+
// Initializing form of the 2x2 double-precision nanokernel. Expands to two
// AArch64 "fmul (by element)" instructions in inline-asm string form:
//   v<C0>.2d = v<A>.2d * v<B>.d[0]
//   v<C1>.2d = v<A>.2d * v<B>.d[1]
// C0, C1, A and B are stringified and appended to "v", so each must be a bare
// vector register number. Because fmul overwrites its destination instead of
// accumulating into it (as fmla does), the C registers do not need to hold
// meaningful values beforehand -- presumably intended for the first rank-1
// update of a microtile; confirm against callers.
#define DGEMM_2X2_NANOKERNEL_INIT(C0,C1,A,B) \
54+
" fmul v"#C0".2d, v"#A".2d, v"#B".d[0] \n\t" \
55+
" fmul v"#C1".2d, v"#A".2d, v"#B".d[1] \n\t"
56+
5057
// Accumulating 4x4 single-precision nanokernel. Expands to four AArch64
// "fmla (by element)" instructions in inline-asm string form; for i = 0..3:
//   v<Ci>.4s += v<A>.4s * v<B>.s[i]
// C0..C3, A and B are stringified and appended to "v", so each must be a bare
// vector register number. The C registers are read as well as written, so
// they must already hold the running sums (or zeros) before this is issued.
#define SGEMM_4X4_NANOKERNEL(C0,C1,C2,C3,A,B) \
5158
" fmla v"#C0".4s, v"#A".4s, v"#B".s[0] \n\t" \
5259
" fmla v"#C1".4s, v"#A".4s, v"#B".s[1] \n\t" \
5360
" fmla v"#C2".4s, v"#A".4s, v"#B".s[2] \n\t" \
5461
" fmla v"#C3".4s, v"#A".4s, v"#B".s[3] \n\t"
5562

63+
// Alias for SGEMM_4X4_NANOKERNEL: forwards (C0,C1,C2,C3,A,B) unchanged, so it
// emits the same four accumulating (fmla) instructions. The _PLAIN suffix
// mirrors the _INIT variant so a caller can select either form by suffix
// (presumably via token pasting -- confirm against the kernels that use it).
#define SGEMM_4X4_NANOKERNEL_PLAIN(C0,C1,C2,C3,A,B) \
64+
SGEMM_4X4_NANOKERNEL(C0,C1,C2,C3,A,B)
65+
66+
// Initializing form of the 4x4 single-precision nanokernel. Expands to four
// AArch64 "fmul (by element)" instructions in inline-asm string form; for
// i = 0..3:
//   v<Ci>.4s = v<A>.4s * v<B>.s[i]
// C0..C3, A and B are stringified and appended to "v", so each must be a bare
// vector register number. Because fmul overwrites its destination instead of
// accumulating into it (as fmla does), the C registers do not need to hold
// meaningful values beforehand -- presumably intended for the first rank-1
// update of a microtile; confirm against callers.
#define SGEMM_4X4_NANOKERNEL_INIT(C0,C1,C2,C3,A,B) \
67+
" fmul v"#C0".4s, v"#A".4s, v"#B".s[0] \n\t" \
68+
" fmul v"#C1".4s, v"#A".4s, v"#B".s[1] \n\t" \
69+
" fmul v"#C2".4s, v"#A".4s, v"#B".s[2] \n\t" \
70+
" fmul v"#C3".4s, v"#A".4s, v"#B".s[3] \n\t"
71+

0 commit comments

Comments
 (0)