isilence/Gramian_matrix.cl

## Gramian_matrix.cl
// preferred device arithmetic vector width
#if !defined(DLM_ARCH_VECSZ_CHAR)
    #define DLM_ARCH_VECSZ_CHAR 4
#endif
#if !defined(DLM_ARCH_VECSZ_SHORT)
    #define DLM_ARCH_VECSZ_SHORT 1
#endif
#if !defined(DLM_ARCH_VECSZ_INT)
    #define DLM_ARCH_VECSZ_INT 1
#endif
#if !defined(DLM_ARCH_VECSZ_LONG)
    #define DLM_ARCH_VECSZ_LONG 1
#endif
#if !defined(DLM_ARCH_VECSZ_FLOAT)
    #define DLM_ARCH_VECSZ_FLOAT 1
#endif
#if !defined(DLM_ARCH_VECSZ_DOUBLE)
    #define DLM_ARCH_VECSZ_DOUBLE 1
#endif

// useful macroses
#define DLM_QUOTE_(str) #str
#define DLM_QUOTE(str) DLM_QUOTE_(str)
#define DLM_PRIMITIVE_CONCAT(str1, str2) str1 ## str2
#define DLM_CONCAT(str1, str2) DLM_PRIMITIVE_CONCAT(str1, str2)

// compiler hints
#if  __OPENCL_VERSION__ >= 210
    #define DLM_UNROLL __attribute__((opencl_unroll_hint))
    #define DLM_UNROLLN(n) __attribute__((opencl_unroll_hint( n )))
    #define DLM_NOSVM __attribute__((nosvm))
#else
    #define DLM_UNROLL
    #define DLM_UNROLLN(n)
    #define DLM_NOSVM
#endif

#if defined(DLM_COMPILE_WITHOUT_SVM)
    #define DLM_STAT_SVM DLM_NO_SVM
#else
    #define DLM_STAT_SVM
#endif


// datatype generalization
typedef char char1;
typedef uchar uchar1;
typedef short short1;
typedef ushort ushort1;
typedef int int1;
typedef uint uint1;
typedef long long1;
typedef ulong ulong1;
typedef float float1;
#if defined(cl_khr_fp64)
    typedef double double1;
#endif


// memory operations generalizaiton
#define vload1(offset, ptr) ((ptr)[offset])
#define vloadN(N, offset, ptr) DLM_CONCAT(vload, N)((offset), (ptr))
#define vloadSN(N, ptr) vloadN(N, 0, ptr)

#define vstore1(data, offset, ptr) (ptr)[offset] = (data)
#define vstoreN(N, data, offset, ptr) DLM_CONCAT(vstore, N)((data), (offset), (ptr))
#define vstoreSN(N, data, ptr) vstoreN(N, data, 0, ptr)

#define vtransferN(N, loadOffset, loadPtr, storeOffset, storePtr) \
    vstoreN(N, \
            vloadN(N, (loadOffset), (loadPtr)), \
            (storeOffset), (storePtr) \
    )
#define vtransferSN(N, loadPtr, storePtr) vtransferN(N, 0, loadPtr, 0, storePtr)

// work-group foos
#if (__OPENCL_VERSION__ - 0 < 210)
    inline size_t dlm_loc_lin_id(void) {
        return get_local_id(1) * get_local_size(0) + get_local_id(0);
    }
    inline size_t dlm_glob_lin_id(void) {
        return get_global_id(1) * get_global_size(0) + get_global_id(0);
    }
#else
    inline size_t dlm_loc_lin_id(void) {
        return get_local_linear_id();
    }
    inline size_t dlm_glob_lin_id(void) {
        return get_global_linear_id();
    }
#endif
inline size_t dlm_group_offset(const uint dimindx) {
    return get_group_id(dimindx) * get_local_size(dimindx);
}

// vector type folding
inline float sumVec1(const float v) {
    return v;
}
inline float sumVec2(const float2 v) {
    return v.x + v.y;
}
inline float sumVec4(const float4 v) {
    return dot(v, (float4)1.0f);
}
inline float sumVec8(const float8 v) {
    return sumVec4(v.s0123 + v.s4567);
}
#define sumVecN(N, v) DLM_CONCAT(sumVec, N)((v))


// ================================================
//        gram preferences

#if !defined(BLOCK_SIDEX)
    #define BLOCK_SIDEX 8
#endif
#if !defined(BLOCK_SIDEY)
    #define BLOCK_SIDEY 8
#endif
#if (BLOCK_SIDEY == BLOCK_SIDEX)
    #define BLOCK_SIDE BLOCK_SIDEX
#endif

#define N DLM_ARCH_VECSZ_FLOAT
#define M 1
#define BLOCK_SIZE (BLOCK_SIDEX * BLOCK_SIDEY)
#define DATA_WIDTH (M * BLOCK_SIZE)

typedef DLM_CONCAT(float, N) floatN;


// Gram matrix computation block algorithm
// group size: (BLOCK_SIDEX, BLOCK_SIDEY, 1)
// global size: should match with gram matrix size (n * n)
//
// params:
// n - implicit argument, equal get_global_id(i), i = {1,2}.
//     number of vectors in input array & size of gram matrix
// vs - array of input vectors (n * vecSz). Vectors stored in memory sequentially
// output - gram matrix (n * n)
// vecSz - length of each input vector
//
//    algorithm basics
// 1. each work-item computes appropriate element of gram matrix,
// 2. each group computes [BLOCK_SIDEX * BLOCK_SIDEY] block of gram matrix,
//    starts with group_global_offset (see dlm_group_offset) gram's element in each dimension
// 3. Outer loop:
//    work-item computes dot product of [DATA_WIDTH] elements each iteration
// 4. First inner loop:
//    whole group reads [DATA_WIDTH] chunk of vector each iteration.
//    After we have [BLOCK_SIDEX] & [BLOCK_SIDEY] chunks of vectors in local memory
//

__attribute__((vec_type_hint(floatN)))
__attribute__((work_group_size_hint(BLOCK_SIDEX, BLOCK_SIDEY, 1)))
__kernel void computeGramMrx(
    DLM_STAT_SVM __global const float * __restrict const vs,
    DLM_STAT_SVM __global float * __restrict const output,
    const int vecSz)
{
    __local float xss[BLOCK_SIDEX][DATA_WIDTH + 1]; // avoid bank conflicts!
    __local float yss[BLOCK_SIDEY][DATA_WIDTH + 1];

    const size_t localId = dlm_loc_lin_id();
    const size_t xBlockShift = dlm_group_offset(0) * vecSz + M * localId;
    const size_t yBlockShift = dlm_group_offset(1) * vecSz + M * localId;

    floatN s = (floatN) 0.0f;
    for (size_t layer=0; layer < vecSz; layer += DATA_WIDTH) {
        __global const float * layeredVs = &vs[layer];
        for (int i=0; i < BLOCK_SIDE; ++i, layeredVs += vecSz) {
            vtransferSN( M, &layeredVs[xBlockShift], &xss[i][localId * M] );
            vtransferSN( M, &layeredVs[yBlockShift], &yss[i][localId * M] );
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        for (int i=0; i < DATA_WIDTH; i += vec_step(s)) {
            const floatN x = vloadSN( N, &xss[get_local_id(0)][i] );
            const floatN y = vloadSN( N, &yss[get_local_id(1)][i] );
            s += x * y; //mad(x,y,s);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    output[dlm_glob_lin_id()] = sumVecN(N, s);
}
	// preferred device arithmetic vector width
	#if !defined(DLM_ARCH_VECSZ_CHAR)
	#define DLM_ARCH_VECSZ_CHAR 4
	#endif
	#if !defined(DLM_ARCH_VECSZ_SHORT)
	#define DLM_ARCH_VECSZ_SHORT 1
	#endif
	#if !defined(DLM_ARCH_VECSZ_INT)
	#define DLM_ARCH_VECSZ_INT 1
	#endif
	#if !defined(DLM_ARCH_VECSZ_LONG)
	#define DLM_ARCH_VECSZ_LONG 1
	#endif
	#if !defined(DLM_ARCH_VECSZ_FLOAT)
	#define DLM_ARCH_VECSZ_FLOAT 1
	#endif
	#if !defined(DLM_ARCH_VECSZ_DOUBLE)
	#define DLM_ARCH_VECSZ_DOUBLE 1
	#endif

	// useful macroses
	#define DLM_QUOTE_(str) #str
	#define DLM_QUOTE(str) DLM_QUOTE_(str)
	#define DLM_PRIMITIVE_CONCAT(str1, str2) str1 ## str2
	#define DLM_CONCAT(str1, str2) DLM_PRIMITIVE_CONCAT(str1, str2)

	// compiler hints
	#if __OPENCL_VERSION__ >= 210
	#define DLM_UNROLL __attribute__((opencl_unroll_hint))
	#define DLM_UNROLLN(n) __attribute__((opencl_unroll_hint( n )))
	#define DLM_NOSVM __attribute__((nosvm))
	#else
	#define DLM_UNROLL
	#define DLM_UNROLLN(n)
	#define DLM_NOSVM
	#endif

	#if defined(DLM_COMPILE_WITHOUT_SVM)
	#define DLM_STAT_SVM DLM_NO_SVM
	#else
	#define DLM_STAT_SVM
	#endif


	// datatype generalization
	typedef char char1;
	typedef uchar uchar1;
	typedef short short1;
	typedef ushort ushort1;
	typedef int int1;
	typedef uint uint1;
	typedef long long1;
	typedef ulong ulong1;
	typedef float float1;
	#if defined(cl_khr_fp64)
	typedef double double1;
	#endif


	// memory operations generalizaiton
	#define vload1(offset, ptr) ((ptr)[offset])
	#define vloadN(N, offset, ptr) DLM_CONCAT(vload, N)((offset), (ptr))
	#define vloadSN(N, ptr) vloadN(N, 0, ptr)

	#define vstore1(data, offset, ptr) (ptr)[offset] = (data)
	#define vstoreN(N, data, offset, ptr) DLM_CONCAT(vstore, N)((data), (offset), (ptr))
	#define vstoreSN(N, data, ptr) vstoreN(N, data, 0, ptr)

	#define vtransferN(N, loadOffset, loadPtr, storeOffset, storePtr) \
	vstoreN(N, \
	vloadN(N, (loadOffset), (loadPtr)), \
	(storeOffset), (storePtr) \
	)
	#define vtransferSN(N, loadPtr, storePtr) vtransferN(N, 0, loadPtr, 0, storePtr)

	// work-group foos
	#if (__OPENCL_VERSION__ - 0 < 210)
	inline size_t dlm_loc_lin_id(void) {
	return get_local_id(1) * get_local_size(0) + get_local_id(0);
	}
	inline size_t dlm_glob_lin_id(void) {
	return get_global_id(1) * get_global_size(0) + get_global_id(0);
	}
	#else
	inline size_t dlm_loc_lin_id(void) {
	return get_local_linear_id();
	}
	inline size_t dlm_glob_lin_id(void) {
	return get_global_linear_id();
	}
	#endif
	inline size_t dlm_group_offset(const uint dimindx) {
	return get_group_id(dimindx) * get_local_size(dimindx);
	}

	// vector type folding
	inline float sumVec1(const float v) {
	return v;
	}
	inline float sumVec2(const float2 v) {
	return v.x + v.y;
	}
	inline float sumVec4(const float4 v) {
	return dot(v, (float4)1.0f);
	}
	inline float sumVec8(const float8 v) {
	return sumVec4(v.s0123 + v.s4567);
	}
	#define sumVecN(N, v) DLM_CONCAT(sumVec, N)((v))



	// ================================================
	// gram preferences

	#if !defined(BLOCK_SIDEX)
	#define BLOCK_SIDEX 8
	#endif
	#if !defined(BLOCK_SIDEY)
	#define BLOCK_SIDEY 8
	#endif
	#if (BLOCK_SIDEY == BLOCK_SIDEX)
	#define BLOCK_SIDE BLOCK_SIDEX
	#endif

	#define N DLM_ARCH_VECSZ_FLOAT
	#define M 1
	#define BLOCK_SIZE (BLOCK_SIDEX * BLOCK_SIDEY)
	#define DATA_WIDTH (M * BLOCK_SIZE)

	typedef DLM_CONCAT(float, N) floatN;


	// Gram matrix computation block algorithm
	// group size: (BLOCK_SIDEX, BLOCK_SIDEY, 1)
	// global size: should match with gram matrix size (n * n)
	//
	// params:
	// n - implicit argument, equal get_global_id(i), i = {1,2}.
	// number of vectors in input array & size of gram matrix
	// vs - array of input vectors (n * vecSz). Vectors stored in memory sequentially
	// output - gram matrix (n * n)
	// vecSz - length of each input vector
	//
	// algorithm basics
	// 1. each work-item computes appropriate element of gram matrix,
	// 2. each group computes [BLOCK_SIDEX * BLOCK_SIDEY] block of gram matrix,
	// starts with group_global_offset (see dlm_group_offset) gram's element in each dimension
	// 3. Outer loop:
	// work-item computes dot product of [DATA_WIDTH] elements each iteration
	// 4. First inner loop:
	// whole group reads [DATA_WIDTH] chunk of vector each iteration.
	// After we have [BLOCK_SIDEX] & [BLOCK_SIDEY] chunks of vectors in local memory
	//

	__attribute__((vec_type_hint(floatN)))
	__attribute__((work_group_size_hint(BLOCK_SIDEX, BLOCK_SIDEY, 1)))
	__kernel void computeGramMrx(
	DLM_STAT_SVM __global const float * __restrict const vs,
	DLM_STAT_SVM __global float * __restrict const output,
	const int vecSz)
	{
	__local float xss[BLOCK_SIDEX][DATA_WIDTH + 1]; // avoid bank conflicts!
	__local float yss[BLOCK_SIDEY][DATA_WIDTH + 1];

	const size_t localId = dlm_loc_lin_id();
	const size_t xBlockShift = dlm_group_offset(0) * vecSz + M * localId;
	const size_t yBlockShift = dlm_group_offset(1) * vecSz + M * localId;

	floatN s = (floatN) 0.0f;
	for (size_t layer=0; layer < vecSz; layer += DATA_WIDTH) {
	__global const float * layeredVs = &vs[layer];
	for (int i=0; i < BLOCK_SIDE; ++i, layeredVs += vecSz) {
	vtransferSN( M, &layeredVs[xBlockShift], &xss[i][localId * M] );
	vtransferSN( M, &layeredVs[yBlockShift], &yss[i][localId * M] );
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	for (int i=0; i < DATA_WIDTH; i += vec_step(s)) {
	const floatN x = vloadSN( N, &xss[get_local_id(0)][i] );
	const floatN y = vloadSN( N, &yss[get_local_id(1)][i] );
	s += x * y; //mad(x,y,s);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	}
	output[dlm_glob_lin_id()] = sumVecN(N, s);
	}