Created
March 5, 2017 17:38
-
-
Save PolarNick239/096173b0a398b6933108f6025a87bdfd to your computer and use it in GitHub Desktop.
Solution on OpenCL for http://codeforces.com/contest/769/problem/D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://codeforces.com/contest/769/problem/D | |
import time | |
import numpy as np | |
import pyopencl as cl | |
runs_number = 50 | |
n, k = [int(x) for x in input("n k: ").split()] | |
values = input("Values (or empty line for random values): ") | |
if len(values) == 0: | |
np.random.seed(239) | |
xs = np.random.randint(0, 10001, n).astype(np.int32) | |
print("Values: {}".format(xs)) | |
else: | |
xs = np.int32([int(x) for x in values.split()]) | |
ctx = cl.create_some_context(interactive=True) | |
queue = cl.CommandQueue(ctx) | |
program = cl.Program(ctx, """ | |
#line 20 | |
__kernel void match( | |
__global const int *xs, int n, int k, __global int *res) | |
{ | |
int global_i = get_global_id(0); | |
int local_i = get_local_id(1); | |
int group_size = get_local_size(1); | |
int itemRes = 0; | |
if (global_i < n) { | |
int x0 = xs[global_i]; | |
for (int j = local_i; j < global_i; j += group_size) { | |
int x1 = xs[j]; | |
if (popcount(x0 ^ x1) == k) { | |
itemRes++; | |
} | |
} | |
} | |
__local int groupRes[256]; | |
int offset = get_local_id(0) * group_size; | |
groupRes[offset + local_i] = itemRes; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int step = 1; step < group_size; step *= 2) { | |
if (local_i % (2 * step) == 0) { | |
groupRes[offset + local_i] += groupRes[offset + local_i + step]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
if (local_i == 0) { | |
atomic_add(res, groupRes[offset + 0]); | |
} | |
} | |
""").build() | |
results = [] | |
time_measures = [] | |
for i in range(runs_number): | |
start = time.time() | |
mf = cl.mem_flags | |
xs_cl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=xs) | |
res_cl = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=np.int32([0])) | |
work_size_0 = 4 | |
work_size_1 = 256 // work_size_0 | |
program.match(queue, ((n + work_size_0 - 1) // work_size_0 * work_size_0, work_size_1), (work_size_0, work_size_1), xs_cl, np.int32(n), np.int32(k), res_cl) | |
res_np = np.int32([-1]) | |
cl.enqueue_copy(queue, res_np, res_cl) | |
passed = time.time() - start | |
print("Result: {}".format(res_np[0])) | |
print("Time: {:.4f} seconds".format(passed)) | |
results.append(res_np[0]) | |
time_measures.append(passed) | |
print("_____________________________________________") | |
if not np.all(np.array(results) == results[0]): | |
print("Some results differ!") | |
else: | |
time_measures = np.float32(time_measures) | |
print("Result: {}".format(results[0])) | |
print("Time: {:.4f}/{:.4f}/{:.4f} seconds (min/avg/max)".format(time_measures.min(), np.average(time_measures), time_measures.max())) | |
# Some time measuares on AMD R9 390X: | |
# n=10000: 0.0035 seconds | |
# n=100000: 0.0285 seconds | |
# n=1000000: 4.3 seconds (seems that here starts cache-misses, good idea is to separate execution to chunks with size of ~cache) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment