Skip to content

Instantly share code, notes, and snippets.

@Lait-au-Cafe
Created December 26, 2017 12:05
Show Gist options
  • Save Lait-au-Cafe/964104c21e0ee2b6b00a0c79f02d33b8 to your computer and use it in GitHub Desktop.
Save Lait-au-Cafe/964104c21e0ee2b6b00a0c79f02d33b8 to your computer and use it in GitHub Desktop.
Bitonic Sort
import pyopencl as cl
import numpy as np
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
length = np.int32(256)
labelElements = np.zeros(length * 2, dtype=np.int32)
length_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, length.nbytes, hostbuf=length)
sortedElements_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, labelElements.nbytes)
sequence = np.empty(2, dtype=np.int32)
sequence_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, sequence.nbytes)
code_bitonicSort = '''
#define GROUP_SIZE_X 16
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1)))
void firstSort(
__constant int *input,
__constant int *length,
__global int *result,
__global int *sequence
){
int thid = get_global_id(0);
int grid = get_local_id(0);
int gridx = get_group_id(0);
__local int local_data[(GROUP_SIZE_X * 2) * 2];
local_data[(2 * grid) * 2] = input[(2 * thid) * 2]; //index
local_data[(2 * grid) * 2 + 1] = input[(2 * thid) * 2 + 1]; //key
local_data[(2 * grid + 1) * 2] = input[(2 * thid + 1) * 2]; //index
local_data[(2 * grid + 1) * 2 + 1] = input[(2 * thid + 1) * 2 + 1];//key
barrier(CLK_LOCAL_MEM_FENCE);
__local int temp[GROUP_SIZE_X * 2];
//=========================================================================
// Radix Sort
//=========================================================================
for(int k=0; k<32; k++){
int bit1 = (local_data[(2 * grid) * 2 + 1] >> k) & 1;
int bit2 = (local_data[(2 * grid + 1) * 2 + 1] >> k) & 1;
temp[2 * grid] = bit1 ^ 1; //0と1を反転
temp[2 * grid + 1] = bit2 ^ 1;
barrier(CLK_LOCAL_MEM_FENCE);
int total = temp[(2 * GROUP_SIZE_X) - 1];
//=====================================================================
// Scan
//=====================================================================
int offset = 1;
for(int i=(2 * GROUP_SIZE_X)>>1; i>0; i>>=1){
barrier(CLK_LOCAL_MEM_FENCE);
if(grid < i){
int ai = offset*(2*grid+1)-1;
int bi = offset*(2*grid+2)-1;
temp[bi]+=temp[ai];
}
offset <<= 1;
}
if(grid==0) temp[(2 * GROUP_SIZE_X) - 1] = 0;
for(int i=1; i<(2 * GROUP_SIZE_X); i<<=1){
offset >>= 1;
barrier(CLK_LOCAL_MEM_FENCE);
if(grid < i){
int ai = offset*(2*grid+1)-1;
int bi = offset*(2*grid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
total += temp[(2 * GROUP_SIZE_X) - 1];
//=====================================================================
// Sort
//=====================================================================
int t1 = (2 * grid) - temp[(2 * grid)] + total;
int t2 = (2 * grid + 1) - temp[(2 * grid + 1)] + total;
temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)];
temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)];
int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]);
int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]);
barrier(CLK_LOCAL_MEM_FENCE);
local_data[temp[(2 * grid)] * 2] = d1.x;
local_data[temp[(2 * grid)] * 2 + 1] = d1.y;
local_data[temp[(2 * grid + 1)] * 2] = d2.x;
local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y;
barrier(CLK_LOCAL_MEM_FENCE);
}
if(gridx%2==0){
result[(2 * thid) * 2] = local_data[(2 * grid) * 2];
result[(2 * thid) * 2 + 1] = local_data[(2 * grid) * 2 + 1];
result[(2 * thid + 1) * 2] = local_data[(2 * grid + 1) * 2];
result[(2 * thid + 1) * 2 + 1] = local_data[(2 * grid + 1) * 2 + 1];
}
else{
result[(2 * thid) * 2] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid) + 1) * 2];
result[(2 * thid) * 2 + 1] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid) + 1) * 2 + 1];
result[(2 * thid + 1) * 2] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid)) * 2];
result[(2 * thid + 1) * 2 + 1] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid)) * 2 + 1];
}
// シーケンスを設定
if(thid==0){
sequence[0] = 4;
sequence[1] = sequence[0] >> 1;
}
}
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1)))
void bitonicSort(
/*__constant int *input, */
__constant int *length,
__global int *result,
__global int *sequence
){
int thid = get_global_id(0);
int grid = get_local_id(0);
int gridx = get_group_id(0);
int dir = 1;//ソートの方向. 1で小->大, 0で大->小
__local int local_data[(GROUP_SIZE_X * 2) * 2];
__local int temp[GROUP_SIZE_X * 2];
//=========================================================================
// ソートするブロックと方向を指定
//=========================================================================
// 方向
dir = ((int)(gridx / (sequence[0] >> 1)) & 1) ^ 1;// 1グループあたり2列を担当することに注意
int seq[2] = {sequence[0], sequence[1]};
barrier(CLK_LOCAL_MEM_FENCE);
// 一つ目のブロック
local_data[grid * 2]
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2]; //index
local_data[grid * 2 + 1]
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1]; //key
// 二つ目のブロック
local_data[(grid + GROUP_SIZE_X) * 2]
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2]; //index
local_data[(grid + GROUP_SIZE_X) * 2 + 1]
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1];//key
barrier(CLK_LOCAL_MEM_FENCE);
for(int k=0; k<32; k++){
int bit1 = (local_data[(2 * grid) * 2 + 1] >> k) & 1;
int bit2 = (local_data[(2 * grid + 1) * 2 + 1] >> k) & 1;
temp[2 * grid] = bit1 ^ 1;
temp[2 * grid + 1] = bit2 ^ 1;
barrier(CLK_LOCAL_MEM_FENCE);
int total = temp[(2 * GROUP_SIZE_X) - 1];
//=====================================================================
// Scan
//=====================================================================
int offset = 1;
for(int i=(2 * GROUP_SIZE_X)>>1; i>0; i>>=1){
barrier(CLK_LOCAL_MEM_FENCE);
if(grid < i){
int ai = offset*(2*grid+1)-1;
int bi = offset*(2*grid+2)-1;
temp[bi]+=temp[ai];
}
offset <<= 1;
}
if(grid==0) temp[(2 * GROUP_SIZE_X) - 1] = 0;
for(int i=1; i<(2 * GROUP_SIZE_X); i<<=1){
offset >>= 1;
barrier(CLK_LOCAL_MEM_FENCE);
if(grid < i){
int ai = offset*(2*grid+1)-1;
int bi = offset*(2*grid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
total += temp[(2 * GROUP_SIZE_X) - 1];
//=====================================================================
// Sort
//=====================================================================
int t1 = (2 * grid) - temp[(2 * grid)] + total;
int t2 = (2 * grid + 1) - temp[(2 * grid + 1)] + total;
temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)];
temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)];
int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]);
int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]);
barrier(CLK_LOCAL_MEM_FENCE);
local_data[temp[(2 * grid)] * 2] = d1.x;
local_data[temp[(2 * grid)] * 2 + 1] = d1.y;
local_data[temp[(2 * grid + 1)] * 2] = d2.x;
local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y;
barrier(CLK_LOCAL_MEM_FENCE);
}
//=========================================================================
// 結果を格納
//=========================================================================
if(dir){
// 一つ目のブロック
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2]
= local_data[grid * 2]; //index
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1]
= local_data[grid * 2 + 1]; //key
// 二つ目のブロック
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2]
= local_data[(grid + GROUP_SIZE_X) * 2]; //index
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1]
= local_data[(grid + GROUP_SIZE_X) * 2 + 1]; //key
}
else{
// 上と同様(逆順)
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2]
= local_data[(2 * GROUP_SIZE_X - 1 - grid) * 2];
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1]
= local_data[(2 * GROUP_SIZE_X - 1 - grid) * 2 + 1];
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2]
= local_data[(GROUP_SIZE_X - 1 - grid) * 2];
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1]
= local_data[(GROUP_SIZE_X - 1 - grid) * 2 + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
// シーケンスの更新
if(thid == 0){
if(sequence[1] > 1){
sequence[1] >>= 1;
}
else{
sequence[0] <<= 1;
sequence[1] = sequence[0] >> 1;
}
}
}
'''
prg_bitonicSort = cl.Program(ctx, code_bitonicSort).build()
randomArray = np.empty(2*length, dtype=np.int32)
randomArray[0:length*2:2] = np.arange(1, length+1, 1)
randomArray[1:length*2+1:2] = np.random.randint(0, 1000, length)
print(randomArray)
randomArray_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, randomArray.nbytes, hostbuf=randomArray)
event_firstSort = prg_bitonicSort.firstSort(queue,
[length // 2], [16],
randomArray_Buf, length_Buf, sortedElements_Buf, sequence_Buf)
event_firstSort.wait()
sortedElements = np.empty(labelElements.shape, dtype=np.int32)
cl.enqueue_copy(queue, sortedElements, sortedElements_Buf)
print(sortedElements)
for i in range(sum(range(2, int(np.log2(length//16)) + 1))):
event_bitonicSort = prg_bitonicSort.bitonicSort(queue,
[length // 2], [16],
length_Buf, sortedElements_Buf, sequence_Buf)
event_bitonicSort.wait()
cl.enqueue_copy(queue, sortedElements, sortedElements_Buf)
print(sortedElements)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment