0x1F9F1/neon.c

## neon.c
/*
 * Convert 16 signed 8-bit samples read from `src` into 16 floats written
 * to `dst`.
 *
 * Each output equals sample / 128.0f, so results lie in [-1.0, 127/128].
 * No integer-to-float conversion instruction is used: the biased sample is
 * added directly into the mantissa bits of 65536.0f (0x47800000), where one
 * mantissa step is worth 1/128, and a single float add then removes the
 * carrier value and re-centres the result.
 */
void SDL_Convert_S8_to_F32_NEON(const Sint8* src, float* dst)
{
    /* Bit pattern of 65536.0f; adding x to it yields 65536 + x/128. */
    const uint32x4_t carrier_bits = vdupq_n_u32(0x47800000u);
    /* Removes the 65536 carrier and shifts [0, 2) down onto [-1, 1). */
    const float32x4_t recentre = vdupq_n_f32(-65537.0f);
    /* XOR with the sign bit maps signed [-128, 127] onto unsigned [0, 255]. */
    const uint8x16_t sign_bit = vdupq_n_u8(0x80);

    const uint8x16_t raw = vld1q_u8((const uint8_t*) src);
    const uint8x16_t biased = veorq_u8(raw, sign_bit);

    /* Zero-extend the sixteen bytes to two vectors of eight 16-bit lanes. */
    const uint16x8_t wide_lo = vmovl_u8(vget_low_u8(biased));
    const uint16x8_t wide_hi = vmovl_u8(vget_high_u8(biased));

    /* Widening add drops each sample into the low mantissa bits. */
    const uint32x4_t bits0 = vaddw_u16(carrier_bits, vget_low_u16(wide_lo));
    const uint32x4_t bits1 = vaddw_u16(carrier_bits, vget_high_u16(wide_lo));
    const uint32x4_t bits2 = vaddw_u16(carrier_bits, vget_low_u16(wide_hi));
    const uint32x4_t bits3 = vaddw_u16(carrier_bits, vget_high_u16(wide_hi));

    vst1q_f32(dst + 0, vaddq_f32(vreinterpretq_f32_u32(bits0), recentre));
    vst1q_f32(dst + 4, vaddq_f32(vreinterpretq_f32_u32(bits1), recentre));
    vst1q_f32(dst + 8, vaddq_f32(vreinterpretq_f32_u32(bits2), recentre));
    vst1q_f32(dst + 12, vaddq_f32(vreinterpretq_f32_u32(bits3), recentre));
}

/*
 * Convert 16 unsigned 8-bit samples read from `src` into 16 floats written
 * to `dst`.
 *
 * Each output equals sample / 128.0f - 1.0f, so results lie in
 * [-1.0, 127/128]. The sample is added directly into the mantissa bits of
 * 65536.0f (0x47800000), where one mantissa step is worth 1/128; a single
 * float add then removes the carrier value and re-centres the result.
 */
void SDL_Convert_U8_to_F32_NEON(const Uint8* src, float* dst)
{
    /* Bit pattern of 65536.0f; adding x to it yields 65536 + x/128. */
    const uint32x4_t carrier_bits = vdupq_n_u32(0x47800000u);
    /* Removes the 65536 carrier and shifts [0, 2) down onto [-1, 1). */
    const float32x4_t recentre = vdupq_n_f32(-65537.0f);

    const uint8x16_t raw = vld1q_u8((const uint8_t*) src);

    /* Zero-extend the sixteen bytes to two vectors of eight 16-bit lanes. */
    const uint16x8_t wide_lo = vmovl_u8(vget_low_u8(raw));
    const uint16x8_t wide_hi = vmovl_u8(vget_high_u8(raw));

    /* Widening add drops each sample into the low mantissa bits. */
    const uint32x4_t bits0 = vaddw_u16(carrier_bits, vget_low_u16(wide_lo));
    const uint32x4_t bits1 = vaddw_u16(carrier_bits, vget_high_u16(wide_lo));
    const uint32x4_t bits2 = vaddw_u16(carrier_bits, vget_low_u16(wide_hi));
    const uint32x4_t bits3 = vaddw_u16(carrier_bits, vget_high_u16(wide_hi));

    vst1q_f32(dst + 0, vaddq_f32(vreinterpretq_f32_u32(bits0), recentre));
    vst1q_f32(dst + 4, vaddq_f32(vreinterpretq_f32_u32(bits1), recentre));
    vst1q_f32(dst + 8, vaddq_f32(vreinterpretq_f32_u32(bits2), recentre));
    vst1q_f32(dst + 12, vaddq_f32(vreinterpretq_f32_u32(bits3), recentre));
}

/*
 * Convert 16 signed 16-bit samples read from `src` into 16 floats written
 * to `dst`.
 *
 * Each output equals sample / 32768.0f, so results lie in
 * [-1.0, 32767/32768]. The biased sample is added directly into the
 * mantissa bits of 256.0f (0x43800000), where one mantissa step is worth
 * 1/32768; a single float add then removes the carrier value and
 * re-centres the result.
 */
void SDL_Convert_S16_to_F32_NEON(const Sint16* src, float* dst)
{
    /* Bit pattern of 256.0f; adding x to it yields 256 + x/32768. */
    const uint32x4_t carrier_bits = vdupq_n_u32(0x43800000);
    /* Removes the 256 carrier and shifts [0, 2) down onto [-1, 1). */
    const float32x4_t recentre = vdupq_n_f32(-257.0f);
    /* XOR with the sign bit maps signed samples onto unsigned [0, 65535]. */
    const uint16x8_t sign_bit = vdupq_n_u16(0x8000);

    /* Both loads complete before anything is written to dst. */
    const uint16x8_t raw_lo = vld1q_u16((const uint16_t*) (src + 0));
    const uint16x8_t raw_hi = vld1q_u16((const uint16_t*) (src + 8));
    const uint16x8_t biased_lo = veorq_u16(raw_lo, sign_bit);
    const uint16x8_t biased_hi = veorq_u16(raw_hi, sign_bit);

    /* Widening add drops each sample into the low mantissa bits. */
    const uint32x4_t bits0 = vaddw_u16(carrier_bits, vget_low_u16(biased_lo));
    const uint32x4_t bits1 = vaddw_u16(carrier_bits, vget_high_u16(biased_lo));
    const uint32x4_t bits2 = vaddw_u16(carrier_bits, vget_low_u16(biased_hi));
    const uint32x4_t bits3 = vaddw_u16(carrier_bits, vget_high_u16(biased_hi));

    vst1q_f32(dst + 0, vaddq_f32(vreinterpretq_f32_u32(bits0), recentre));
    vst1q_f32(dst + 4, vaddq_f32(vreinterpretq_f32_u32(bits1), recentre));
    vst1q_f32(dst + 8, vaddq_f32(vreinterpretq_f32_u32(bits2), recentre));
    vst1q_f32(dst + 12, vaddq_f32(vreinterpretq_f32_u32(bits3), recentre));
}

/*
 * Convert 16 unsigned 16-bit samples read from `src` into 16 floats written
 * to `dst`.
 *
 * Each output equals sample / 32768.0f - 1.0f, so results lie in
 * [-1.0, 32767/32768]. The sample is added directly into the mantissa bits
 * of 256.0f (0x43800000), where one mantissa step is worth 1/32768; a
 * single float add then removes the carrier value and re-centres the result.
 */
void SDL_Convert_U16_to_F32_NEON(const Uint16* src, float* dst)
{
    /* Bit pattern of 256.0f; adding x to it yields 256 + x/32768. */
    const uint32x4_t carrier_bits = vdupq_n_u32(0x43800000);
    /* Removes the 256 carrier and shifts [0, 2) down onto [-1, 1). */
    const float32x4_t recentre = vdupq_n_f32(-257.0f);

    /* Both loads complete before anything is written to dst. */
    const uint16x8_t raw_lo = vld1q_u16((const uint16_t*) (src + 0));
    const uint16x8_t raw_hi = vld1q_u16((const uint16_t*) (src + 8));

    /* Widening add drops each sample into the low mantissa bits. */
    const uint32x4_t bits0 = vaddw_u16(carrier_bits, vget_low_u16(raw_lo));
    const uint32x4_t bits1 = vaddw_u16(carrier_bits, vget_high_u16(raw_lo));
    const uint32x4_t bits2 = vaddw_u16(carrier_bits, vget_low_u16(raw_hi));
    const uint32x4_t bits3 = vaddw_u16(carrier_bits, vget_high_u16(raw_hi));

    vst1q_f32(dst + 0, vaddq_f32(vreinterpretq_f32_u32(bits0), recentre));
    vst1q_f32(dst + 4, vaddq_f32(vreinterpretq_f32_u32(bits1), recentre));
    vst1q_f32(dst + 8, vaddq_f32(vreinterpretq_f32_u32(bits2), recentre));
    vst1q_f32(dst + 12, vaddq_f32(vreinterpretq_f32_u32(bits3), recentre));
}
	/*
	 * Turn 16 signed 8-bit samples from `src` into 16 floats in `dst`,
	 * each equal to sample / 128.0f (range [-1.0, 127/128]).
	 *
	 * The conversion is done with integer adds into a float bit pattern:
	 * 0x47800000 is 65536.0f, whose mantissa step is 1/128, so adding an
	 * 8-bit value x to those bits gives 65536 + x/128. Adding -65537.0f
	 * afterwards leaves x/128 - 1.
	 */
	void SDL_Convert_S8_to_F32_NEON(const Sint8* src, float* dst)
	{
		const float32x4_t minus_bias = vdupq_n_f32(-65537.0f);
		const uint32x4_t base_bits = vdupq_n_u32(0x47800000u);
		const uint8x16_t msb_mask = vdupq_n_u8(0x80);

		/* Flipping the top bit re-maps [-128, 127] to [0, 255]. */
		const uint8x16_t samples = veorq_u8(vld1q_u8((const uint8_t*) src), msb_mask);

		/* Zero-extend bytes to 16-bit lanes, eight per vector. */
		const uint16x8_t half_a = vmovl_u8(vget_low_u8(samples));
		const uint16x8_t half_b = vmovl_u8(vget_high_u8(samples));

		/* Insert each sample into the mantissa of 65536.0f. */
		const float32x4_t raw0 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_a)));
		const float32x4_t raw1 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_a)));
		const float32x4_t raw2 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_b)));
		const float32x4_t raw3 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_b)));

		/* Strip the carrier value and re-centre around zero. */
		const float32x4_t out0 = vaddq_f32(raw0, minus_bias);
		const float32x4_t out1 = vaddq_f32(raw1, minus_bias);
		const float32x4_t out2 = vaddq_f32(raw2, minus_bias);
		const float32x4_t out3 = vaddq_f32(raw3, minus_bias);

		vst1q_f32(dst, out0);
		vst1q_f32(dst + 4, out1);
		vst1q_f32(dst + 8, out2);
		vst1q_f32(dst + 12, out3);
	}

	/*
	 * Turn 16 unsigned 8-bit samples from `src` into 16 floats in `dst`,
	 * each equal to sample / 128.0f - 1.0f (range [-1.0, 127/128]).
	 *
	 * The conversion is done with integer adds into a float bit pattern:
	 * 0x47800000 is 65536.0f, whose mantissa step is 1/128, so adding an
	 * 8-bit value x to those bits gives 65536 + x/128. Adding -65537.0f
	 * afterwards leaves x/128 - 1.
	 */
	void SDL_Convert_U8_to_F32_NEON(const Uint8* src, float* dst)
	{
		const float32x4_t minus_bias = vdupq_n_f32(-65537.0f);
		const uint32x4_t base_bits = vdupq_n_u32(0x47800000u);

		const uint8x16_t samples = vld1q_u8((const uint8_t*) src);

		/* Zero-extend bytes to 16-bit lanes, eight per vector. */
		const uint16x8_t half_a = vmovl_u8(vget_low_u8(samples));
		const uint16x8_t half_b = vmovl_u8(vget_high_u8(samples));

		/* Insert each sample into the mantissa of 65536.0f. */
		const float32x4_t raw0 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_a)));
		const float32x4_t raw1 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_a)));
		const float32x4_t raw2 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_b)));
		const float32x4_t raw3 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_b)));

		/* Strip the carrier value and re-centre around zero. */
		const float32x4_t out0 = vaddq_f32(raw0, minus_bias);
		const float32x4_t out1 = vaddq_f32(raw1, minus_bias);
		const float32x4_t out2 = vaddq_f32(raw2, minus_bias);
		const float32x4_t out3 = vaddq_f32(raw3, minus_bias);

		vst1q_f32(dst, out0);
		vst1q_f32(dst + 4, out1);
		vst1q_f32(dst + 8, out2);
		vst1q_f32(dst + 12, out3);
	}

	/*
	 * Turn 16 signed 16-bit samples from `src` into 16 floats in `dst`,
	 * each equal to sample / 32768.0f (range [-1.0, 32767/32768]).
	 *
	 * The conversion is done with integer adds into a float bit pattern:
	 * 0x43800000 is 256.0f, whose mantissa step is 1/32768, so adding a
	 * 16-bit value x to those bits gives 256 + x/32768. Adding -257.0f
	 * afterwards leaves x/32768 - 1.
	 */
	void SDL_Convert_S16_to_F32_NEON(const Sint16* src, float* dst)
	{
		const float32x4_t minus_bias = vdupq_n_f32(-257.0f);
		const uint32x4_t base_bits = vdupq_n_u32(0x43800000);
		const uint16x8_t msb_mask = vdupq_n_u16(0x8000);

		/* Flipping the top bit re-maps [-32768, 32767] to [0, 65535]. */
		const uint16x8_t half_a = veorq_u16(vld1q_u16((const uint16_t*) src), msb_mask);
		const uint16x8_t half_b = veorq_u16(vld1q_u16((const uint16_t*) (src + 8)), msb_mask);

		/* Insert each sample into the mantissa of 256.0f. */
		const float32x4_t raw0 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_a)));
		const float32x4_t raw1 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_a)));
		const float32x4_t raw2 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_b)));
		const float32x4_t raw3 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_b)));

		/* Strip the carrier value and re-centre around zero. */
		const float32x4_t out0 = vaddq_f32(raw0, minus_bias);
		const float32x4_t out1 = vaddq_f32(raw1, minus_bias);
		const float32x4_t out2 = vaddq_f32(raw2, minus_bias);
		const float32x4_t out3 = vaddq_f32(raw3, minus_bias);

		vst1q_f32(dst, out0);
		vst1q_f32(dst + 4, out1);
		vst1q_f32(dst + 8, out2);
		vst1q_f32(dst + 12, out3);
	}

	/*
	 * Turn 16 unsigned 16-bit samples from `src` into 16 floats in `dst`,
	 * each equal to sample / 32768.0f - 1.0f (range [-1.0, 32767/32768]).
	 *
	 * The conversion is done with integer adds into a float bit pattern:
	 * 0x43800000 is 256.0f, whose mantissa step is 1/32768, so adding a
	 * 16-bit value x to those bits gives 256 + x/32768. Adding -257.0f
	 * afterwards leaves x/32768 - 1.
	 */
	void SDL_Convert_U16_to_F32_NEON(const Uint16* src, float* dst)
	{
		const float32x4_t minus_bias = vdupq_n_f32(-257.0f);
		const uint32x4_t base_bits = vdupq_n_u32(0x43800000);

		/* Both input vectors are loaded before any output is stored. */
		const uint16x8_t half_a = vld1q_u16((const uint16_t*) src);
		const uint16x8_t half_b = vld1q_u16((const uint16_t*) (src + 8));

		/* Insert each sample into the mantissa of 256.0f. */
		const float32x4_t raw0 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_a)));
		const float32x4_t raw1 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_a)));
		const float32x4_t raw2 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_low_u16(half_b)));
		const float32x4_t raw3 = vreinterpretq_f32_u32(vaddw_u16(base_bits, vget_high_u16(half_b)));

		/* Strip the carrier value and re-centre around zero. */
		const float32x4_t out0 = vaddq_f32(raw0, minus_bias);
		const float32x4_t out1 = vaddq_f32(raw1, minus_bias);
		const float32x4_t out2 = vaddq_f32(raw2, minus_bias);
		const float32x4_t out3 = vaddq_f32(raw3, minus_bias);

		vst1q_f32(dst, out0);
		vst1q_f32(dst + 4, out1);
		vst1q_f32(dst + 8, out2);
		vst1q_f32(dst + 12, out3);
	}