Skip to content

Instantly share code, notes, and snippets.

@belltailjp
Last active October 19, 2017 14:33
Show Gist options
  • Save belltailjp/f33f3bafb70865ec13fc314a632a4928 to your computer and use it in GitHub Desktop.
Save belltailjp/f33f3bafb70865ec13fc314a632a4928 to your computer and use it in GitHub Desktop.
Draw 2D Gaussian Image using Cupy
import time
def measure_time_in_ms(func, n_times, size, sigma):
start = time.time()
for _ in range(n_times):
out = func(size, sigma)
assert(out.shape == (size, size))
elapsed_sec = time.time() - start
return elapsed_sec / n_times * 1000
sizes = [2 ** t for t in range(3, 15)]
n_times = 50
print('size numpy cupy cupy_eltw numpy_sendtoGPU')
for size in sizes:
# Kernel build will run at the first call, so exclude it from time measurement
gen_2d_gaussian_cupy(size, size / 5)
gen_2d_gaussian_cupy_elementwise(size, size / 5)
numpy_ms = measure_time_in_ms(gen_2d_gaussian_numpy, n_times, size, size / 5)
cupy_ms = measure_time_in_ms(gen_2d_gaussian_cupy, n_times, size, size / 5)
cupy_eltw_ms = measure_time_in_ms(gen_2d_gaussian_cupy_elementwise, n_times, size, size / 5)
numpy_togpu_ms = measure_time_in_ms(gen_2d_gaussian_numpy_sendtoGPU, n_times, size, size / 5)
print(f'{size} {numpy_ms} {cupy_ms} {cupy_eltw_ms} {numpy_togpu_ms}')
#include <iostream>
#include <chrono>
__global__ void gen_2d_gaussian_kernel(float *buf, int size, float sigma, int cx, int cy)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if(x < 0 || size <= x || y < 0 || size <= y) return;
const int dx = x - cx, dy = y - cy;
const float d = sqrtf(dx * dx + dy * dy);
buf[y * size + x] = expf(-d * d / (2 * sigma * sigma));
}
void gen_2d_gaussian(float *buf_gpu, int size, float sigma)
{
cudaStream_t stream;
cudaStreamCreate(&stream);
if(size <= 32)
{
dim3 threads(size, size);
gen_2d_gaussian_kernel<<<1, threads, 0, stream>>>(buf_gpu, size, sigma, size / 2, size / 2);
}else{
dim3 threads(32, 32);
dim3 block(size / 32, size / 32);
gen_2d_gaussian_kernel<<<block, threads, 0, stream>>>(buf_gpu, size, sigma, size / 2, size / 2);
}
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
}
double measure_time_in_ms(int size, int n_times, float sigma)
{
auto start = std::chrono::high_resolution_clock::now();
for(int i = 0; i < n_times; ++i)
{
float *buf;
cudaMalloc(&buf, sizeof(float) * size * size);
gen_2d_gaussian(buf, size, sigma);
cudaFree(buf);
}
auto duration = std::chrono::high_resolution_clock::now() - start;
return std::chrono::duration_cast<std::chrono::microseconds>(duration).count() / 1000.0 / n_times;
}
int main()
{
for(int size = 8; size <= 16384; size *= 2)
{
measure_time_in_ms(size, 100, size / 5.0f); // To eliminate first kernel call overhead
std::cout << size << " " << measure_time_in_ms(size, 50, size / 5.0f) << "ms" << std::endl;
}
}
import cupy
def gen_2d_gaussian_cupy(size, sigma):
x = cupy.arange(0, size, dtype=cupy.float32)
gaussian_1d = cupy.exp(-(x - size / 2)**2.0 / (2 * sigma**2))
return cupy.outer(gaussian_1d, gaussian_1d)
import cupy
# Create the instance only once, in order to prevent building kernel every time the function is called
kernel = cupy.ElementwiseKernel('T x, T y, T cx, T cy, T sigma',
'T out',
'''
T dx = x - cx, dy = y - cy;
T d = sqrtf(dx * dx + dy * dy);
out = expf(-d * d / (2 * sigma * sigma));
''',
'gen_2d_gaussian')
def gen_2d_gaussian_cupy_elementwise(size, sigma):
rng = cupy.arange(size, dtype=cupy.float32)
x, y = cupy.meshgrid(rng, rng)
return kernel(x, y, size / 2, size / 2, sigma)
import numpy, scipy.signal
def gen_2d_gaussian_numpy(size, sigma):
ar = scipy.signal.gaussian(size, std=sigma).astype(numpy.float32)
return numpy.outer(ar, ar)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment