Created
December 26, 2017 12:05
-
-
Save Lait-au-Cafe/964104c21e0ee2b6b00a0c79f02d33b8 to your computer and use it in GitHub Desktop.
Bitonic Sort
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyopencl as cl | |
import numpy as np | |
ctx = cl.create_some_context() | |
queue = cl.CommandQueue(ctx) | |
length = np.int32(256) | |
labelElements = np.zeros(length * 2, dtype=np.int32) | |
length_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, length.nbytes, hostbuf=length) | |
sortedElements_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, labelElements.nbytes) | |
sequence = np.empty(2, dtype=np.int32) | |
sequence_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, sequence.nbytes) | |
code_bitonicSort = ''' | |
#define GROUP_SIZE_X 16 | |
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1))) | |
void firstSort( | |
__constant int *input, | |
__constant int *length, | |
__global int *result, | |
__global int *sequence | |
){ | |
int thid = get_global_id(0); | |
int grid = get_local_id(0); | |
int gridx = get_group_id(0); | |
__local int local_data[(GROUP_SIZE_X * 2) * 2]; | |
local_data[(2 * grid) * 2] = input[(2 * thid) * 2]; //index | |
local_data[(2 * grid) * 2 + 1] = input[(2 * thid) * 2 + 1]; //key | |
local_data[(2 * grid + 1) * 2] = input[(2 * thid + 1) * 2]; //index | |
local_data[(2 * grid + 1) * 2 + 1] = input[(2 * thid + 1) * 2 + 1];//key | |
barrier(CLK_LOCAL_MEM_FENCE); | |
__local int temp[GROUP_SIZE_X * 2]; | |
//========================================================================= | |
// Radix Sort | |
//========================================================================= | |
for(int k=0; k<32; k++){ | |
int bit1 = (local_data[(2 * grid) * 2 + 1] >> k) & 1; | |
int bit2 = (local_data[(2 * grid + 1) * 2 + 1] >> k) & 1; | |
temp[2 * grid] = bit1 ^ 1; //0と1を反転 | |
temp[2 * grid + 1] = bit2 ^ 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int total = temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Scan | |
//===================================================================== | |
int offset = 1; | |
for(int i=(2 * GROUP_SIZE_X)>>1; i>0; i>>=1){ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
temp[bi]+=temp[ai]; | |
} | |
offset <<= 1; | |
} | |
if(grid==0) temp[(2 * GROUP_SIZE_X) - 1] = 0; | |
for(int i=1; i<(2 * GROUP_SIZE_X); i<<=1){ | |
offset >>= 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
int t = temp[ai]; | |
temp[ai] = temp[bi]; | |
temp[bi] += t; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
total += temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Sort | |
//===================================================================== | |
int t1 = (2 * grid) - temp[(2 * grid)] + total; | |
int t2 = (2 * grid + 1) - temp[(2 * grid + 1)] + total; | |
temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)]; | |
temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)]; | |
int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]); | |
int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
local_data[temp[(2 * grid)] * 2] = d1.x; | |
local_data[temp[(2 * grid)] * 2 + 1] = d1.y; | |
local_data[temp[(2 * grid + 1)] * 2] = d2.x; | |
local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
if(gridx%2==0){ | |
result[(2 * thid) * 2] = local_data[(2 * grid) * 2]; | |
result[(2 * thid) * 2 + 1] = local_data[(2 * grid) * 2 + 1]; | |
result[(2 * thid + 1) * 2] = local_data[(2 * grid + 1) * 2]; | |
result[(2 * thid + 1) * 2 + 1] = local_data[(2 * grid + 1) * 2 + 1]; | |
} | |
else{ | |
result[(2 * thid) * 2] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid) + 1) * 2]; | |
result[(2 * thid) * 2 + 1] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid) + 1) * 2 + 1]; | |
result[(2 * thid + 1) * 2] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid)) * 2]; | |
result[(2 * thid + 1) * 2 + 1] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid)) * 2 + 1]; | |
} | |
// シーケンスを設定 | |
if(thid==0){ | |
sequence[0] = 4; | |
sequence[1] = sequence[0] >> 1; | |
} | |
} | |
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1))) | |
void bitonicSort( | |
/*__constant int *input, */ | |
__constant int *length, | |
__global int *result, | |
__global int *sequence | |
){ | |
int thid = get_global_id(0); | |
int grid = get_local_id(0); | |
int gridx = get_group_id(0); | |
int dir = 1;//ソートの方向. 1で小->大, 0で大->小 | |
__local int local_data[(GROUP_SIZE_X * 2) * 2]; | |
__local int temp[GROUP_SIZE_X * 2]; | |
//========================================================================= | |
// ソートするブロックと方向を指定 | |
//========================================================================= | |
// 方向 | |
dir = ((int)(gridx / (sequence[0] >> 1)) & 1) ^ 1;// 1グループあたり2列を担当することに注意 | |
int seq[2] = {sequence[0], sequence[1]}; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
// 一つ目のブロック | |
local_data[grid * 2] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2]; //index | |
local_data[grid * 2 + 1] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1]; //key | |
// 二つ目のブロック | |
local_data[(grid + GROUP_SIZE_X) * 2] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2]; //index | |
local_data[(grid + GROUP_SIZE_X) * 2 + 1] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1];//key | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for(int k=0; k<32; k++){ | |
int bit1 = (local_data[(2 * grid) * 2 + 1] >> k) & 1; | |
int bit2 = (local_data[(2 * grid + 1) * 2 + 1] >> k) & 1; | |
temp[2 * grid] = bit1 ^ 1; | |
temp[2 * grid + 1] = bit2 ^ 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int total = temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Scan | |
//===================================================================== | |
int offset = 1; | |
for(int i=(2 * GROUP_SIZE_X)>>1; i>0; i>>=1){ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
temp[bi]+=temp[ai]; | |
} | |
offset <<= 1; | |
} | |
if(grid==0) temp[(2 * GROUP_SIZE_X) - 1] = 0; | |
for(int i=1; i<(2 * GROUP_SIZE_X); i<<=1){ | |
offset >>= 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
int t = temp[ai]; | |
temp[ai] = temp[bi]; | |
temp[bi] += t; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
total += temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Sort | |
//===================================================================== | |
int t1 = (2 * grid) - temp[(2 * grid)] + total; | |
int t2 = (2 * grid + 1) - temp[(2 * grid + 1)] + total; | |
temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)]; | |
temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)]; | |
int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]); | |
int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
local_data[temp[(2 * grid)] * 2] = d1.x; | |
local_data[temp[(2 * grid)] * 2 + 1] = d1.y; | |
local_data[temp[(2 * grid + 1)] * 2] = d2.x; | |
local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
//========================================================================= | |
// 結果を格納 | |
//========================================================================= | |
if(dir){ | |
// 一つ目のブロック | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2] | |
= local_data[grid * 2]; //index | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[grid * 2 + 1]; //key | |
// 二つ目のブロック | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2] | |
= local_data[(grid + GROUP_SIZE_X) * 2]; //index | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[(grid + GROUP_SIZE_X) * 2 + 1]; //key | |
} | |
else{ | |
// 上と同様(逆順) | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2] | |
= local_data[(2 * GROUP_SIZE_X - 1 - grid) * 2]; | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[(2 * GROUP_SIZE_X - 1 - grid) * 2 + 1]; | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2] | |
= local_data[(GROUP_SIZE_X - 1 - grid) * 2]; | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[(GROUP_SIZE_X - 1 - grid) * 2 + 1]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
// シーケンスの更新 | |
if(thid == 0){ | |
if(sequence[1] > 1){ | |
sequence[1] >>= 1; | |
} | |
else{ | |
sequence[0] <<= 1; | |
sequence[1] = sequence[0] >> 1; | |
} | |
} | |
} | |
''' | |
prg_bitonicSort = cl.Program(ctx, code_bitonicSort).build() | |
randomArray = np.empty(2*length, dtype=np.int32) | |
randomArray[0:length*2:2] = np.arange(1, length+1, 1) | |
randomArray[1:length*2+1:2] = np.random.randint(0, 1000, length) | |
print(randomArray) | |
randomArray_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, randomArray.nbytes, hostbuf=randomArray) | |
event_firstSort = prg_bitonicSort.firstSort(queue, | |
[length // 2], [16], | |
randomArray_Buf, length_Buf, sortedElements_Buf, sequence_Buf) | |
event_firstSort.wait() | |
sortedElements = np.empty(labelElements.shape, dtype=np.int32) | |
cl.enqueue_copy(queue, sortedElements, sortedElements_Buf) | |
print(sortedElements) | |
for i in range(sum(range(2, int(np.log2(length//16)) + 1))): | |
event_bitonicSort = prg_bitonicSort.bitonicSort(queue, | |
[length // 2], [16], | |
length_Buf, sortedElements_Buf, sequence_Buf) | |
event_bitonicSort.wait() | |
cl.enqueue_copy(queue, sortedElements, sortedElements_Buf) | |
print(sortedElements) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment