Created
March 17, 2021 19:13
-
-
Save Const-me/c7b80e991b93f8a2d9ac6fb0fd9db878 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Multiply an [ nRows x 81 ] row-major matrix by an 81-element vector:
//   pDst[ i ] = dot( row i of pMat, pVec )  for i in [ 0, nRows ).
// The complete 81-element vector is kept resident in NEON registers for the whole call,
// so each row of the matrix is streamed through memory exactly once.
// pDst must have room for nRows floats; pMat must hold nRows * 81 floats; pVec must hold 81 floats.
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
	// 21 (vector) + 5 (matrix) + 4 (accumulators) = 30 vector registers in total;
	// ARM64 has 32 of them, so we're good.
	float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
	float32x4_t mat0, mat1, mat2, mat3, mat4;
	float32x4_t res0, res1, res2, res3;
	// vec80 and mat4 carry a single scalar in lane 0; lanes 1-3 must stay zero so the
	// final FMA below contributes nothing from the unused lanes.
	vec80 = mat4 = vdupq_n_f32( 0.0f );
	// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 ) \
	v0 = vld1q_f32( pVec ); pVec += 4; \
	v1 = vld1q_f32( pVec ); pVec += 4; \
	v2 = vld1q_f32( pVec ); pVec += 4; \
	v3 = vld1q_f32( pVec ); pVec += 4
	// Load the complete vector into registers using the above macro
	LOAD_VEC_16( vec0_3, vec4_7, vec8_11, vec12_15 );
	LOAD_VEC_16( vec16_19, vec20_23, vec24_27, vec28_31 );
	LOAD_VEC_16( vec32_35, vec36_39, vec40_43, vec44_47 );
	LOAD_VEC_16( vec48_51, vec52_55, vec56_59, vec60_63 );
	LOAD_VEC_16( vec64_67, vec68_71, vec72_75, vec76_79 );
	// Load the final scalar of the vector into lane 0 of vec80.
	// vld1q_lane_f32 is pure — it returns the updated vector, so the result must be
	// assigned; discarding it would silently leave the 81st element at zero.
	vec80 = vld1q_lane_f32( pVec, vec80, 0 );
#undef LOAD_VEC_16
	// Load 16 numbers from pMat into mat0 - mat3, incrementing the source pointer
#define LOAD_MATRIX_16() \
	mat0 = vld1q_f32( pMat ); pMat += 4; \
	mat1 = vld1q_f32( pMat ); pMat += 4; \
	mat2 = vld1q_f32( pMat ); pMat += 4; \
	mat3 = vld1q_f32( pMat ); pMat += 4
	// Multiply 16 numbers in mat0 - mat3 by the specified pieces of the vector, and accumulate into res0 - res3
	// Multiple accumulators is critical for performance; the 4 instructions produced by
	// this macro don't have data dependencies between them.
#define HANDLE_BLOCK_16( v0, v1, v2, v3 ) \
	res0 = vfmaq_f32( res0, mat0, v0 ); \
	res1 = vfmaq_f32( res1, mat1, v1 ); \
	res2 = vfmaq_f32( res2, mat2, v2 ); \
	res3 = vfmaq_f32( res3, mat3, v3 )
	const float* const pMatEnd = pMat + nRows * 81;
	while( pMat < pMatEnd )
	{
		// Initial 16 elements only need multiplications.
		LOAD_MATRIX_16();
		res0 = vmulq_f32( mat0, vec0_3 );
		res1 = vmulq_f32( mat1, vec4_7 );
		res2 = vmulq_f32( mat2, vec8_11 );
		res3 = vmulq_f32( mat3, vec12_15 );
		// Handle the rest of the row using FMA instructions.
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec16_19, vec20_23, vec24_27, vec28_31 );
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec32_35, vec36_39, vec40_43, vec44_47 );
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec48_51, vec52_55, vec56_59, vec60_63 );
		// The final block of the row has 17 scalars instead of 16.
		LOAD_MATRIX_16();
		// Same as vec80 above: the intrinsic's return value must be assigned.
		mat4 = vld1q_lane_f32( pMat, mat4, 0 ); pMat++;
		HANDLE_BLOCK_16( vec64_67, vec68_71, vec72_75, vec76_79 );
		// Lanes 1-3 of mat4 and vec80 are zero, so this only adds the 81st product.
		res0 = vfmaq_f32( res0, mat4, vec80 );
		// Vertically add 4 accumulators into res0
		res1 = vaddq_f32( res1, res2 );
		res0 = vaddq_f32( res3, res0 );
		res0 = vaddq_f32( res1, res0 );
		// Store the horizontal sum of the accumulator
		*pDst = vaddvq_f32( res0 );
		pDst++;
	}
#undef LOAD_MATRIX_16
#undef HANDLE_BLOCK_16
}
@ObjectDetectADAS 1000 floats won’t fit in NEON registers. Just use Eigen https://eigen.tuxfamily.org/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Any idea how to do the same kind of implementation for a 1000 x 1000 matrix?