Skip to content

Instantly share code, notes, and snippets.

@PolarNick239
Created March 5, 2017 17:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PolarNick239/096173b0a398b6933108f6025a87bdfd to your computer and use it in GitHub Desktop.
# http://codeforces.com/contest/769/problem/D
import time
import numpy as np
import pyopencl as cl
# ---- Problem setup: http://codeforces.com/contest/769/problem/D ----
# Count pairs (i, j) with i > j and popcount(xs[i] ^ xs[j]) == k.
runs_number = 50  # repeat the GPU run to collect min/avg/max timings

n, k = [int(x) for x in input("n k: ").split()]
values = input("Values (or empty line for random values): ")
if len(values) == 0:
    # Fixed seed so repeated benchmark runs see the same input data.
    np.random.seed(239)
    xs = np.random.randint(0, 10001, n).astype(np.int32)
    print("Values: {}".format(xs))
else:
    xs = np.int32([int(x) for x in values.split()])

# Interactive context lets the user choose a device when several are present.
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)
# Build the OpenCL kernel.  Global axis 0 enumerates elements i (one x0 per
# column); local axis 1 splits the inner loop over j < i across group_size
# lanes, followed by a local-memory tree reduction and one atomic_add per row.
program = cl.Program(ctx, """
#line 20
__kernel void match(
        __global const int *xs, int n, int k, __global int *res)
{
    int global_i = get_global_id(0);
    int local_i = get_local_id(1);
    int group_size = get_local_size(1);
    int itemRes = 0;
    if (global_i < n) {
        int x0 = xs[global_i];
        // Each lane handles a strided slice of j in [0, global_i).
        for (int j = local_i; j < global_i; j += group_size) {
            int x1 = xs[j];
            if (popcount(x0 ^ x1) == k) {
                itemRes++;
            }
        }
    }
    // Per-group partial sums; 256 == work_size_0 * work_size_1 on the host.
    __local int groupRes[256];
    int offset = get_local_id(0) * group_size;
    groupRes[offset + local_i] = itemRes;
    barrier(CLK_LOCAL_MEM_FENCE);
    // Tree reduction across the axis-1 lanes of this row.
    for (int step = 1; step < group_size; step *= 2) {
        if (local_i % (2 * step) == 0) {
            groupRes[offset + local_i] += groupRes[offset + local_i + step];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_i == 0) {
        atomic_add(res, groupRes[offset + 0]);
    }
}
""").build()
results = []
time_measures = []
for i in range(runs_number):
    start = time.time()
    mf = cl.mem_flags
    xs_cl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=xs)
    res_cl = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=np.int32([0]))
    # 2D launch: axis 0 = element index (rounded up to a multiple of
    # work_size_0), axis 1 = lanes cooperating on one element's inner loop.
    # The product must stay 256 to match the kernel's local array size.
    work_size_0 = 4
    work_size_1 = 256 // work_size_0
    program.match(queue,
                  ((n + work_size_0 - 1) // work_size_0 * work_size_0, work_size_1),
                  (work_size_0, work_size_1),
                  xs_cl, np.int32(n), np.int32(k), res_cl)
    res_np = np.int32([-1])
    cl.enqueue_copy(queue, res_np, res_cl)  # blocking copy: also waits for the kernel
    passed = time.time() - start
    print("Result: {}".format(res_np[0]))
    print("Time: {:.4f} seconds".format(passed))
    results.append(res_np[0])
    time_measures.append(passed)
    print("_____________________________________________")

if not np.all(np.array(results) == results[0]):
    print("Some results differ!")  # non-determinism here would indicate a race
else:
    time_measures = np.float32(time_measures)
    print("Result: {}".format(results[0]))
    print("Time: {:.4f}/{:.4f}/{:.4f} seconds (min/avg/max)".format(
        time_measures.min(), np.average(time_measures), time_measures.max()))
# Some time measures on AMD R9 390X:
# n=10000: 0.0035 seconds
# n=100000: 0.0285 seconds
# n=1000000: 4.3 seconds (seems that here starts cache-misses, good idea is to separate execution to chunks with size of ~cache)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment