Last active
August 29, 2015 14:06
-
-
Save fjarri/6ff4400cde2750e61bff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises | |
# Transposition of a matrix | |
# by Hendrik Riedmann <riedmann@dam.brown.edu> | |
from __future__ import division, print_function | |
import pycuda.driver as cuda | |
import pycuda.gpuarray as gpuarray | |
import pycuda.autoinit | |
from pycuda.compiler import SourceModule | |
import numpy | |
import numpy.linalg as la | |
from pycuda.tools import context_dependent_memoize | |
block_size = 16 | |
@context_dependent_memoize | |
def _get_transpose_kernel(): | |
mod = SourceModule(""" | |
#define BLOCK_SIZE %(block_size)d | |
#define A_BLOCK_STRIDE (BLOCK_SIZE * a_width) | |
#define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height) | |
__global__ void transpose(float *A_t, float *A, int a_width, int a_height) | |
{ | |
// Base indices in A and A_t | |
int base_idx_a = blockIdx.x * BLOCK_SIZE + | |
blockIdx.y * A_BLOCK_STRIDE; | |
int base_idx_a_t = blockIdx.y * BLOCK_SIZE + | |
blockIdx.x * A_T_BLOCK_STRIDE; | |
// Global indices in A and A_t | |
int glob_idx_a = base_idx_a + threadIdx.x + a_width * threadIdx.y; | |
int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y; | |
__shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1]; | |
// Store transposed submatrix to shared memory | |
A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a]; | |
__syncthreads(); | |
// Write transposed submatrix to global memory | |
A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y]; | |
} | |
"""% {"block_size": block_size}) | |
func = mod.get_function("transpose") | |
func.prepare("PPii") | |
from pytools import Record | |
class TransposeKernelInfo(Record): pass | |
return TransposeKernelInfo(func=func, | |
block=(block_size, block_size, 1), | |
block_size=block_size, | |
granularity=block_size) | |
def _get_big_block_transpose_kernel(): | |
mod = SourceModule(""" | |
#define BLOCK_SIZE %(block_size)d | |
#define A_BLOCK_STRIDE (BLOCK_SIZE * a_width) | |
#define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height) | |
__global__ void transpose(float *A, float *A_t, int a_width, int a_height) | |
{ | |
// Base indices in A and A_t | |
int base_idx_a = 2 * blockIdx.x * BLOCK_SIZE + | |
2 * blockIdx.y * A_BLOCK_STRIDE; | |
int base_idx_a_t = 2 * blockIdx.y * BLOCK_SIZE + | |
2 * blockIdx.x * A_T_BLOCK_STRIDE; | |
// Global indices in A and A_t | |
int glob_idx_a = base_idx_a + threadIdx.x + a_width * threadIdx.y; | |
int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y; | |
__shared__ float A_shared[2 * BLOCK_SIZE][2 * BLOCK_SIZE + 1]; | |
// Store transposed submatrix to shared memory | |
A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a]; | |
A_shared[threadIdx.y][threadIdx.x + BLOCK_SIZE] = | |
A[glob_idx_a + A_BLOCK_STRIDE]; | |
A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x] = | |
A[glob_idx_a + BLOCK_SIZE]; | |
A_shared[threadIdx.y + BLOCK_SIZE][threadIdx.x + BLOCK_SIZE] = | |
A[glob_idx_a + BLOCK_SIZE + A_BLOCK_STRIDE]; | |
__syncthreads(); | |
// Write transposed submatrix to global memory | |
A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y]; | |
A_t[glob_idx_a_t + A_T_BLOCK_STRIDE] = | |
A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y]; | |
A_t[glob_idx_a_t + BLOCK_SIZE] = | |
A_shared[threadIdx.x][threadIdx.y + BLOCK_SIZE]; | |
A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] = | |
A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE]; | |
} | |
"""% {"block_size": block_size}) | |
func = mod.get_function("transpose") | |
func.prepare("PPii") | |
from pytools import Record | |
class TransposeKernelInfo(Record): pass | |
return TransposeKernelInfo(func=func, | |
block=(block_size, block_size, 1), | |
block_size=block_size, | |
granularity=2*block_size) | |
def _transpose(tgt, src): | |
krnl = _get_transpose_kernel() | |
w, h = src.shape | |
assert tgt.shape == (h, w) | |
assert w % krnl.granularity == 0 | |
assert h % krnl.granularity == 0 | |
krnl.func.prepared_call( | |
(w // krnl.granularity, h // krnl.granularity), krnl.block, | |
tgt.gpudata, src.gpudata, w, h) | |
def transpose(src): | |
w, h = src.shape | |
result = gpuarray.empty((h, w), dtype=src.dtype) | |
_transpose(result, src) | |
return result | |
def check_transpose(): | |
from pycuda.curandom import rand | |
for i in numpy.arange(10, 13, 0.125): | |
size = int(((2**i) // 32) * 32) | |
print(size) | |
source = rand((size, size), dtype=numpy.float32) | |
result = transpose(source) | |
err = source.get().T - result.get() | |
err_norm = la.norm(err) | |
source.gpudata.free() | |
result.gpudata.free() | |
assert err_norm == 0, (size, err_norm) | |
def run_benchmark(): | |
from pycuda.curandom import rand | |
powers = numpy.arange(10, 13, 2**(-6)) | |
sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)] | |
bandwidths = [] | |
times = [] | |
for size in sizes: | |
source = rand((size, size), dtype=numpy.float32) | |
target = gpuarray.empty((size, size), dtype=source.dtype) | |
start = pycuda.driver.Event() | |
stop = pycuda.driver.Event() | |
warmup = 2 | |
for i in range(warmup): | |
_transpose(target, source) | |
count = 10 | |
cuda.Context.synchronize() | |
start.record() | |
for i in range(count): | |
_transpose(target, source) | |
stop.record() | |
stop.synchronize() | |
elapsed_seconds = stop.time_since(start)*1e-3 | |
mem_bw = source.nbytes / elapsed_seconds * 2 * count | |
bandwidths.append(mem_bw) | |
times.append(elapsed_seconds) | |
slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9] | |
print("Sizes for which bandwidth was low:", slow_sizes) | |
print("Ditto, mod 64:", [s % 64 for s in slow_sizes]) | |
from matplotlib.pyplot import semilogx, loglog, show, savefig, clf, xlabel, ylabel | |
xlabel('matrix size') | |
ylabel('bandwidth') | |
semilogx(sizes, bandwidths) | |
savefig("transpose-bw.png") | |
clf() | |
xlabel('matrix size') | |
ylabel('time') | |
loglog(sizes, times) | |
savefig("transpose-times.png") | |
#check_transpose() | |
run_benchmark() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment