Skip to content

Instantly share code, notes, and snippets.

@Const-me
Created March 17, 2021 19:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Const-me/c7b80e991b93f8a2d9ac6fb0fd9db878 to your computer and use it in GitHub Desktop.
Save Const-me/c7b80e991b93f8a2d9ac6fb0fd9db878 to your computer and use it in GitHub Desktop.
// Multiply an [ nRows x 81 ] row-major matrix by an 81-element vector:
// pDst[ r ] = dot( row r of pMat, pVec ) for every row.
// pDst must have room for nRows floats; pMat holds nRows * 81 floats; pVec holds 81 floats.
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
	// 30 vector registers in total; ARM64 has 32 of them, so we're good.
	float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31,
		vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63,
		vec64_67, vec68_71, vec72_75, vec76_79, vec80;
	float32x4_t mat0, mat1, mat2, mat3, mat4;
	float32x4_t res0, res1, res2, res3;
	// vec80 / mat4 only ever hold one scalar in lane 0; lanes 1-3 must stay zero
	// so the final FMA contributes exactly one scalar product.
	vec80 = mat4 = vdupq_n_f32( 0.0f );

	// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 ) \
	v0 = vld1q_f32( pVec ); pVec += 4; \
	v1 = vld1q_f32( pVec ); pVec += 4; \
	v2 = vld1q_f32( pVec ); pVec += 4; \
	v3 = vld1q_f32( pVec ); pVec += 4

	// Load the complete vector into registers using the above macro
	LOAD_VEC_16( vec0_3, vec4_7, vec8_11, vec12_15 );
	LOAD_VEC_16( vec16_19, vec20_23, vec24_27, vec28_31 );
	LOAD_VEC_16( vec32_35, vec36_39, vec40_43, vec44_47 );
	LOAD_VEC_16( vec48_51, vec52_55, vec56_59, vec60_63 );
	LOAD_VEC_16( vec64_67, vec68_71, vec72_75, vec76_79 );
	// Load the final scalar of the vector into lane 0 of vec80.
	// BUG FIX: vld1q_lane_f32 returns the updated vector; the original code discarded
	// the result, leaving vec80 zero so the 81st element never contributed.
	vec80 = vld1q_lane_f32( pVec, vec80, 0 );
#undef LOAD_VEC_16

	// Load 16 numbers from pMat into mat0 - mat3, incrementing the source pointer
#define LOAD_MATRIX_16() \
	mat0 = vld1q_f32( pMat ); pMat += 4; \
	mat1 = vld1q_f32( pMat ); pMat += 4; \
	mat2 = vld1q_f32( pMat ); pMat += 4; \
	mat3 = vld1q_f32( pMat ); pMat += 4

	// Multiply 16 numbers in mat0 - mat3 by the specified pieces of the vector, and accumulate into res0 - res3
	// Multiple accumulators is critical for performance, 4 instructions produced by this macro don't have data dependencies between them.
#define HANDLE_BLOCK_16( v0, v1, v2, v3 ) \
	res0 = vfmaq_f32( res0, mat0, v0 ); \
	res1 = vfmaq_f32( res1, mat1, v1 ); \
	res2 = vfmaq_f32( res2, mat2, v2 ); \
	res3 = vfmaq_f32( res3, mat3, v3 )

	const float* const pMatEnd = pMat + nRows * 81;
	while( pMat < pMatEnd )
	{
		// Initial 16 elements only need multiplications.
		LOAD_MATRIX_16();
		res0 = vmulq_f32( mat0, vec0_3 );
		res1 = vmulq_f32( mat1, vec4_7 );
		res2 = vmulq_f32( mat2, vec8_11 );
		res3 = vmulq_f32( mat3, vec12_15 );
		// Handle the rest of the row using FMA instructions.
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec16_19, vec20_23, vec24_27, vec28_31 );
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec32_35, vec36_39, vec40_43, vec44_47 );
		LOAD_MATRIX_16();
		HANDLE_BLOCK_16( vec48_51, vec52_55, vec56_59, vec60_63 );
		// The final block of the row has 17 scalars instead of 16.
		LOAD_MATRIX_16();
		// BUG FIX: assign the result of vld1q_lane_f32; the original code discarded it,
		// so mat4 stayed zero and the 81st matrix element of every row was ignored.
		mat4 = vld1q_lane_f32( pMat, mat4, 0 ); pMat++;
		HANDLE_BLOCK_16( vec64_67, vec68_71, vec72_75, vec76_79 );
		// Lanes 1-3 of mat4 and vec80 are zero, so this adds a single scalar product.
		res0 = vfmaq_f32( res0, mat4, vec80 );
		// Vertically add 4 accumulators into res0
		res1 = vaddq_f32( res1, res2 );
		res0 = vaddq_f32( res3, res0 );
		res0 = vaddq_f32( res1, res0 );
		// Store the horizontal sum of the accumulator
		*pDst = vaddvq_f32( res0 );
		pDst++;
	}
#undef LOAD_MATRIX_16
#undef HANDLE_BLOCK_16
}
@ruthreshx
Copy link

Any idea how to implement the same thing for a 1000 x 1000 matrix?

@Const-me
Copy link
Author

@ObjectDetectADAS 1000 floats won’t fit in NEON registers. Just use Eigen https://eigen.tuxfamily.org/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment