AliBarber/short2half.kernel.c

## short2half.kernel.c
/*
 * BitReverseTable256[b] holds the byte b with its 8 bits in reverse order
 * (index 0x01 holds 0x80, index 0x02 holds 0x40, ...).
 *
 * The 256 entries are generated by the nested helper macros below: each
 * level fixes two more bits of the index and adds the matching mirrored
 * bits to the running value n.
 */
unsigned char constant BitReverseTable256[256]  =
{
#   define R2(n)     n,     n + 2*64,     n + 1*64,     n + 3*64
#   define R4(n) R2(n), R2(n + 2*16), R2(n + 1*16), R2(n + 3*16)
#   define R6(n) R4(n), R4(n + 2*4 ), R4(n + 1*4 ), R4(n + 3*4 )
    R6(0), R6(2), R6(1), R6(3)
};
// The generator macros only make sense inside the initializer above; drop
// them so their short names do not leak into the rest of the file.
#undef R6
#undef R4
#undef R2

/*
 * De Bruijn lookup table for locating a single set bit in a 32-bit word.
 *
 * The kernel multiplies an isolated lowest set bit (v & -v) by 0x077CB531U
 * and uses the top five bits of the product (>> 27) as the index; the entry
 * found there is the position of that bit.
 */
int constant MultiplyDeBruijnBitPosition[32] =
{
	 0,  1, 28,  2, 29, 14, 24,  3,
	30, 22, 20, 15, 25, 17,  4,  8,
	31, 27, 13, 23, 21, 19, 16,  7,
	26, 12, 18,  6, 11,  5, 10,  9
};

/*
 * Converts each 16-bit signed integer in data_in to the bit pattern of the
 * equivalent IEEE 754 half-precision (binary16) value and writes it to
 * data_out.
 *
 * One work-item converts one element, selected by get_global_id(0). The
 * kernel takes no length argument, so the index is not bounds-checked here.
 *
 * binary16 keeps 11 significant bits (one implicit, ten stored). Magnitudes
 * of 2048 and above need more than that; their excess low bits are dropped
 * (truncation toward zero).
 *
 * data_in:  source integers.
 * data_out: receives the encoded values: sign in bit 15, biased exponent in
 *           bits 14..10, fraction in bits 9..0.
 */
void kernel short2half(global const short* data_in, global unsigned short* data_out)
{
	const int thread_id = get_global_id(0);
	const short value = data_in[thread_id];
	const unsigned short sign_mask = value & 0x8000;

	// OpenCL's abs() on a short yields an unsigned result, so -32768 maps to
	// 32768 instead of overflowing.
	unsigned short significand = (unsigned short) abs(value);

	// Zero has no leading one to normalise against; its encoding is all zero
	// bits (sign_mask is 0 here). Without this guard the bit search below
	// would report a bogus position and produce 0x7800 (32768.0).
	if (significand == 0)
	{
		data_out[thread_id] = sign_mask;
		return;
	}

	// Mirror the 16-bit magnitude so that its highest set bit becomes the
	// lowest set bit, which the De Bruijn lookup can locate.
	const int reverse_value =
		(BitReverseTable256[significand & 0xff] << 8) |
		(BitReverseTable256[(significand >> 8) & 0xff]);

	const int bit_position = MultiplyDeBruijnBitPosition[((unsigned int)((reverse_value & -reverse_value) * 0x077CB531U)) >> 27];

	// bit_position counts from the mirrored end; convert it back to the index
	// (0..15) of the most significant set bit of the magnitude.
	const int msb = 15 - bit_position;

	// Move the leading one to bit 10 (the implicit-one position). Small values
	// shift left; values with msb > 10 must shift right, losing low bits.
	const int shifts = 10 - msb;
	if (shifts >= 0)
	{
		significand <<= shifts;
	}
	else
	{
		significand >>= -shifts;
	}
	significand &= 0x3FF; // strip the implicit leading one

	// Biased exponent: bias 15 plus the power of two of the leading bit.
	const unsigned short exponent = 15 + 10 - shifts;

	data_out[thread_id] = ((unsigned short) (exponent << 10) | significand) | sign_mask;
}
	/*
	 * BitReverseTable256[b] holds the byte b with its 8 bits in reverse order
	 * (index 0x01 holds 0x80, index 0x02 holds 0x40, ...).
	 *
	 * The 256 entries are generated by the nested helper macros below: each
	 * level fixes two more bits of the index and adds the matching mirrored
	 * bits to the running value n.
	 */
	unsigned char constant BitReverseTable256[256] =
	{
	#   define R2(n)     n,     n + 2*64,     n + 1*64,     n + 3*64
	#   define R4(n) R2(n), R2(n + 2*16), R2(n + 1*16), R2(n + 3*16)
	#   define R6(n) R4(n), R4(n + 2*4 ), R4(n + 1*4 ), R4(n + 3*4 )
		R6(0), R6(2), R6(1), R6(3)
	};
	// The generator macros only make sense inside the initializer above; drop
	// them so their short names do not leak into the rest of the file.
	#undef R6
	#undef R4
	#undef R2

	/*
	 * De Bruijn lookup table for locating a single set bit in a 32-bit word.
	 *
	 * The kernel multiplies an isolated lowest set bit (v & -v) by 0x077CB531U
	 * and uses the top five bits of the product (>> 27) as the index; the entry
	 * found there is the position of that bit.
	 */
	int constant MultiplyDeBruijnBitPosition[32] =
	{
		 0,  1, 28,  2, 29, 14, 24,  3,
		30, 22, 20, 15, 25, 17,  4,  8,
		31, 27, 13, 23, 21, 19, 16,  7,
		26, 12, 18,  6, 11,  5, 10,  9
	};

	/*
	 * Converts each 16-bit signed integer in data_in to the bit pattern of the
	 * equivalent IEEE 754 half-precision (binary16) value and writes it to
	 * data_out.
	 *
	 * One work-item converts one element, selected by get_global_id(0). The
	 * kernel takes no length argument, so the index is not bounds-checked here.
	 *
	 * binary16 keeps 11 significant bits (one implicit, ten stored). Magnitudes
	 * of 2048 and above need more than that; their excess low bits are dropped
	 * (truncation toward zero).
	 *
	 * data_in:  source integers.
	 * data_out: receives the encoded values: sign in bit 15, biased exponent in
	 *           bits 14..10, fraction in bits 9..0.
	 */
	void kernel short2half(global const short* data_in, global unsigned short* data_out)
	{
		const int thread_id = get_global_id(0);
		const short value = data_in[thread_id];
		const unsigned short sign_mask = value & 0x8000;

		// OpenCL's abs() on a short yields an unsigned result, so -32768 maps to
		// 32768 instead of overflowing.
		unsigned short significand = (unsigned short) abs(value);

		// Zero has no leading one to normalise against; its encoding is all zero
		// bits (sign_mask is 0 here). Without this guard the bit search below
		// would report a bogus position and produce 0x7800 (32768.0).
		if (significand == 0)
		{
			data_out[thread_id] = sign_mask;
			return;
		}

		// Mirror the 16-bit magnitude so that its highest set bit becomes the
		// lowest set bit, which the De Bruijn lookup can locate.
		const int reverse_value =
			(BitReverseTable256[significand & 0xff] << 8) |
			(BitReverseTable256[(significand >> 8) & 0xff]);

		const int bit_position = MultiplyDeBruijnBitPosition[((unsigned int)((reverse_value & -reverse_value) * 0x077CB531U)) >> 27];

		// bit_position counts from the mirrored end; convert it back to the index
		// (0..15) of the most significant set bit of the magnitude.
		const int msb = 15 - bit_position;

		// Move the leading one to bit 10 (the implicit-one position). Small values
		// shift left; values with msb > 10 must shift right, losing low bits.
		const int shifts = 10 - msb;
		if (shifts >= 0)
		{
			significand <<= shifts;
		}
		else
		{
			significand >>= -shifts;
		}
		significand &= 0x3FF; // strip the implicit leading one

		// Biased exponent: bias 15 plus the power of two of the leading bit.
		const unsigned short exponent = 15 + 10 - shifts;

		data_out[thread_id] = ((unsigned short) (exponent << 10) | significand) | sign_mask;
	}