Last active
April 6, 2017 21:03
-
-
Save isilence/52b41a53fc0f0cbb538f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// preferred device arithmetic vector width | |
#if !defined(DLM_ARCH_VECSZ_CHAR) | |
#define DLM_ARCH_VECSZ_CHAR 4 | |
#endif | |
#if !defined(DLM_ARCH_VECSZ_SHORT) | |
#define DLM_ARCH_VECSZ_SHORT 1 | |
#endif | |
#if !defined(DLM_ARCH_VECSZ_INT) | |
#define DLM_ARCH_VECSZ_INT 1 | |
#endif | |
#if !defined(DLM_ARCH_VECSZ_LONG) | |
#define DLM_ARCH_VECSZ_LONG 1 | |
#endif | |
#if !defined(DLM_ARCH_VECSZ_FLOAT) | |
#define DLM_ARCH_VECSZ_FLOAT 1 | |
#endif | |
#if !defined(DLM_ARCH_VECSZ_DOUBLE) | |
#define DLM_ARCH_VECSZ_DOUBLE 1 | |
#endif | |
// useful macroses | |
#define DLM_QUOTE_(str) #str | |
#define DLM_QUOTE(str) DLM_QUOTE_(str) | |
#define DLM_PRIMITIVE_CONCAT(str1, str2) str1 ## str2 | |
#define DLM_CONCAT(str1, str2) DLM_PRIMITIVE_CONCAT(str1, str2) | |
// compiler hints | |
#if __OPENCL_VERSION__ >= 210 | |
#define DLM_UNROLL __attribute__((opencl_unroll_hint)) | |
#define DLM_UNROLLN(n) __attribute__((opencl_unroll_hint( n ))) | |
#define DLM_NOSVM __attribute__((nosvm)) | |
#else | |
#define DLM_UNROLL | |
#define DLM_UNROLLN(n) | |
#define DLM_NOSVM | |
#endif | |
#if defined(DLM_COMPILE_WITHOUT_SVM) | |
#define DLM_STAT_SVM DLM_NO_SVM | |
#else | |
#define DLM_STAT_SVM | |
#endif | |
// datatype generalization | |
typedef char char1; | |
typedef uchar uchar1; | |
typedef short short1; | |
typedef ushort ushort1; | |
typedef int int1; | |
typedef uint uint1; | |
typedef long long1; | |
typedef ulong ulong1; | |
typedef float float1; | |
#if defined(cl_khr_fp64) | |
typedef double double1; | |
#endif | |
// memory operations generalizaiton | |
#define vload1(offset, ptr) ((ptr)[offset]) | |
#define vloadN(N, offset, ptr) DLM_CONCAT(vload, N)((offset), (ptr)) | |
#define vloadSN(N, ptr) vloadN(N, 0, ptr) | |
#define vstore1(data, offset, ptr) (ptr)[offset] = (data) | |
#define vstoreN(N, data, offset, ptr) DLM_CONCAT(vstore, N)((data), (offset), (ptr)) | |
#define vstoreSN(N, data, ptr) vstoreN(N, data, 0, ptr) | |
#define vtransferN(N, loadOffset, loadPtr, storeOffset, storePtr) \ | |
vstoreN(N, \ | |
vloadN(N, (loadOffset), (loadPtr)), \ | |
(storeOffset), (storePtr) \ | |
) | |
#define vtransferSN(N, loadPtr, storePtr) vtransferN(N, 0, loadPtr, 0, storePtr) | |
// work-group foos | |
#if (__OPENCL_VERSION__ - 0 < 210) | |
inline size_t dlm_loc_lin_id(void) { | |
return get_local_id(1) * get_local_size(0) + get_local_id(0); | |
} | |
inline size_t dlm_glob_lin_id(void) { | |
return get_global_id(1) * get_global_size(0) + get_global_id(0); | |
} | |
#else | |
inline size_t dlm_loc_lin_id(void) { | |
return get_local_linear_id(); | |
} | |
inline size_t dlm_glob_lin_id(void) { | |
return get_global_linear_id(); | |
} | |
#endif | |
inline size_t dlm_group_offset(const uint dimindx) { | |
return get_group_id(dimindx) * get_local_size(dimindx); | |
} | |
// vector type folding | |
inline float sumVec1(const float v) { | |
return v; | |
} | |
inline float sumVec2(const float2 v) { | |
return v.x + v.y; | |
} | |
inline float sumVec4(const float4 v) { | |
return dot(v, (float4)1.0f); | |
} | |
inline float sumVec8(const float8 v) { | |
return sumVec4(v.s0123 + v.s4567); | |
} | |
#define sumVecN(N, v) DLM_CONCAT(sumVec, N)((v)) | |
// ================================================ | |
// gram preferences | |
#if !defined(BLOCK_SIDEX) | |
#define BLOCK_SIDEX 8 | |
#endif | |
#if !defined(BLOCK_SIDEY) | |
#define BLOCK_SIDEY 8 | |
#endif | |
#if (BLOCK_SIDEY == BLOCK_SIDEX) | |
#define BLOCK_SIDE BLOCK_SIDEX | |
#endif | |
#define N DLM_ARCH_VECSZ_FLOAT | |
#define M 1 | |
#define BLOCK_SIZE (BLOCK_SIDEX * BLOCK_SIDEY) | |
#define DATA_WIDTH (M * BLOCK_SIZE) | |
typedef DLM_CONCAT(float, N) floatN; | |
// Gram matrix computation block algorithm | |
// group size: (BLOCK_SIDEX, BLOCK_SIDEY, 1) | |
// global size: should match with gram matrix size (n * n) | |
// | |
// params: | |
// n - implicit argument, equal get_global_id(i), i = {1,2}. | |
// number of vectors in input array & size of gram matrix | |
// vs - array of input vectors (n * vecSz). Vectors stored in memory sequentially | |
// output - gram matrix (n * n) | |
// vecSz - length of each input vector | |
// | |
// algorithm basics | |
// 1. each work-item computes appropriate element of gram matrix, | |
// 2. each group computes [BLOCK_SIDEX * BLOCK_SIDEY] block of gram matrix, | |
// starts with group_global_offset (see dlm_group_offset) gram's element in each dimension | |
// 3. Outer loop: | |
// work-item computes dot product of [DATA_WIDTH] elements each iteration | |
// 4. First inner loop: | |
// whole group reads [DATA_WIDTH] chunk of vector each iteration. | |
// After we have [BLOCK_SIDEX] & [BLOCK_SIDEY] chunks of vectors in local memory | |
// | |
__attribute__((vec_type_hint(floatN))) | |
__attribute__((work_group_size_hint(BLOCK_SIDEX, BLOCK_SIDEY, 1))) | |
__kernel void computeGramMrx( | |
DLM_STAT_SVM __global const float * __restrict const vs, | |
DLM_STAT_SVM __global float * __restrict const output, | |
const int vecSz) | |
{ | |
__local float xss[BLOCK_SIDEX][DATA_WIDTH + 1]; // avoid bank conflicts! | |
__local float yss[BLOCK_SIDEY][DATA_WIDTH + 1]; | |
const size_t localId = dlm_loc_lin_id(); | |
const size_t xBlockShift = dlm_group_offset(0) * vecSz + M * localId; | |
const size_t yBlockShift = dlm_group_offset(1) * vecSz + M * localId; | |
floatN s = (floatN) 0.0f; | |
for (size_t layer=0; layer < vecSz; layer += DATA_WIDTH) { | |
__global const float * layeredVs = &vs[layer]; | |
for (int i=0; i < BLOCK_SIDE; ++i, layeredVs += vecSz) { | |
vtransferSN( M, &layeredVs[xBlockShift], &xss[i][localId * M] ); | |
vtransferSN( M, &layeredVs[yBlockShift], &yss[i][localId * M] ); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int i=0; i < DATA_WIDTH; i += vec_step(s)) { | |
const floatN x = vloadSN( N, &xss[get_local_id(0)][i] ); | |
const floatN y = vloadSN( N, &yss[get_local_id(1)][i] ); | |
s += x * y; //mad(x,y,s); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
output[dlm_glob_lin_id()] = sumVecN(N, s); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment