Skip to content

Instantly share code, notes, and snippets.

@Const-me
Created March 17, 2021 19:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Const-me/c7b80e991b93f8a2d9ac6fb0fd9db878 to your computer and use it in GitHub Desktop.
Save Const-me/c7b80e991b93f8a2d9ac6fb0fd9db878 to your computer and use it in GitHub Desktop.
// Multiply an [ nRows x 81 ] row-major matrix by an 81-element vector:
// pDst[ r ] = dot( row r of pMat, pVec ) for every row.
// pDst must have room for nRows floats; pMat holds nRows * 81 floats; pVec holds 81 floats.
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
	// 30 vector registers in total; ARM64 has 32 of them, so we're good.
	float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31,
		vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63,
		vec64_67, vec68_71, vec72_75, vec76_79, vec80;
	float32x4_t mat0, mat1, mat2, mat3, mat4;
	float32x4_t res0, res1, res2, res3;
	// vec80 / mat4 only ever hold one scalar in lane 0; lanes 1-3 must stay zero
	// so the final FMA contributes exactly one scalar product.
	vec80 = mat4 = vdupq_n_f32( 0.0f );

	// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 ) \
	v0 = vld1q_f32( pVec ); pVec += 4; \
	v1 = vld1q_f32( pVec ); pVec += 4; \
	v2 = vld1q_f32( pVec ); pVec += 4; \
	v3 = vld1q_f32( pVec ); pVec += 4

	// Load the complete vector into registers using the above macro
	LOAD_VEC_16( vec0_3, vec4_7, vec8_11, vec12_15 );
	LOAD_VEC_16( vec16_19, vec20_23, vec24_27, vec28_31 );
	LOAD_VEC_16( vec32_35, vec36_39, vec40_43, vec44_47 );
	LOAD_VEC_16( vec48_51, vec52_55, vec56_59, vec60_63 );
	LOAD_VEC_16( vec64_67, vec68_71, vec72_75, vec76_79 );
	// Load the final scalar of the vector into lane 0 of vec80.
	// BUG FIX: vld1q_lane_f32 returns the updated vector; the original code discarded
	// the result, leaving vec80 zero so the 81st element never contributed.
	vec80 = vld1q_lane_f32( pVec, vec80, 0 );
#undef LOAD_VEC_16

	// Load 16 numbers from pMat into mat0 - mat3, incrementing the source pointer
#define LOAD_MATRIX_16() \
	mat0 = vld1q_f32( pMat ); pMat += 4; \
	mat1 = vld1q_f32( pMat ); pMat += 4; \
	mat2 = vld1q_f32( pMat ); pMat += 4; \
	mat3 = vld1q_f32( pMat ); pMat += 4

	// Multiply 16 numbers in mat0 - mat3 by the specified pieces of the vector, and accumulate into res0 - res3
	// Multiple accumulators is critical for performance, 4 instructions produced by this macro don't have data dependencies between them.
#define HANDLE_BLOCK_16( v0, v1, v2, v3 ) \
	res0 = vfmaq_f32( res0, mat0, v0 ); \
	res1 = vfmaq_f32( res1, mat1, v1 ); \
	res2 = vfmaq_f32( res2, mat2, v2 ); \
	res3 = vfmaq_f32( res3, mat3, v3 )

	const float* const pMatEnd = pMat + nRows * 81;
	while( pMat < pMatEnd )
	{
		// Initial 16 elements only need multiplications.
		LOAD_MATRIX_16();
		res0 = vmulq_f32( mat0, vec0_3 );
		res1 = vmulq_f32( mat1, vec4_7 );
		res2 = vmulq_f32( mat2, vec8_11 );
		res3 = vmulq_f32( mat3, vec12_15 );
		// Handle the rest of the row using FMA instructions.
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec16_19, vec20_23, vec24_27, vec28_31 );
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec32_35, vec36_39, vec40_43, vec44_47 );
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec48_51, vec52_55, vec56_59, vec60_63 );
		// The final block of the row has 17 scalars instead of 16.
		LOAD_MATRIX_16();
		// BUG FIX: assign the result of vld1q_lane_f32; the original code discarded it,
		// so mat4 stayed zero and the 81st matrix element of every row was ignored.
		mat4 = vld1q_lane_f32( pMat, mat4, 0 ); pMat++;
		HANDLE_BLOCK_16( vec64_67, vec68_71, vec72_75, vec76_79 );
		// Lanes 1-3 of mat4 and vec80 are zero, so this adds a single scalar product.
		res0 = vfmaq_f32( res0, mat4, vec80 );
		// Vertically add 4 accumulators into res0
		res1 = vaddq_f32( res1, res2 );
		res0 = vaddq_f32( res3, res0 );
		res0 = vaddq_f32( res1, res0 );
		// Store the horizontal sum of the accumulator
		*pDst = vaddvq_f32( res0 );
		pDst++;
	}
#undef LOAD_MATRIX_16
#undef HANDLE_BLOCK_16
}
@ruthreshx
Copy link

Any idea how to implement the same thing for a 1000 x 1000 matrix?

@Const-me
Copy link
Author

@ObjectDetectADAS 1000 floats won’t fit in NEON registers. Just use Eigen https://eigen.tuxfamily.org/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment