Skip to content

Instantly share code, notes, and snippets.

@fjarri
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fjarri/27dfbee989d8792bda0c to your computer and use it in GitHub Desktop.
import time
import numpy
from reikna.cluda import cuda_api
from reikna.fft import FFT, FFTShift
import reikna.cluda.dtypes as dtypes
from reikna.core import Transformation, Parameter, Annotation, Type
def fftshift(arr_t, axes=None):
    """Build a reikna Transformation that permutes indices like ``numpy.fft.fftshift``.

    The transformation copies each input element to a shifted output index so
    that, when attached to an FFT's output, the zero-frequency component ends
    up in the middle of the selected axes.

    :param arr_t: an array-like type descriptor (must expose ``.shape``) used
        for both the input and output annotations.
    :param axes: iterable of axes to shift over; ``None`` means all axes.
    :returns: a ``reikna.core.Transformation`` with parameters ``output`` ('o')
        and ``input`` ('i').
    """
    if axes is None:
        axes = tuple(range(len(arr_t.shape)))
    else:
        # Normalized to a sorted tuple so the Mako template sees a stable,
        # hashable value in render_kwds.
        axes = tuple(sorted(axes))
    # The template below is Mako-rendered into kernel code. For each dimension
    # it computes a shifted index:
    #   even size N:  i < N/2      -> i + N/2,  else i - N/2
    #   odd  size N:  i <= N//2    -> i + N//2, else i - (N//2 + 1)
    # which is exactly the index permutation numpy.fft.fftshift applies
    # (a cyclic roll by N//2 along each selected axis).
    return Transformation(
        [Parameter('output', Annotation(arr_t, 'o')),
            Parameter('input', Annotation(arr_t, 'i'))],
        """
<%
dimensions = len(output.shape)
new_idx_names = ['new_idx' + str(i) for i in range(dimensions)]
%>
%for dim in range(dimensions):
VSIZE_T ${new_idx_names[dim]} =
${idxs[dim]}
%if dim in axes:
%if output.shape[dim] % 2 == 0:
+ (${idxs[dim]} < ${output.shape[dim] // 2} ?
${output.shape[dim] // 2} :
${-output.shape[dim] // 2})
%else:
+ (${idxs[dim]} <= ${output.shape[dim] // 2} ?
${output.shape[dim] // 2} :
${-(output.shape[dim] // 2 + 1)})
%endif
%endif
;
%endfor
${output.ctype} val = ${input.load_same};
${output.store_idx}(${', '.join(new_idx_names)}, val);
""",
        render_kwds=dict(
            axes=axes))
def run_test(thr, shape, dtype, axes=None):
    """Benchmark FFT + fftshift on the GPU and CPU and return wall-clock timings.

    Measures four GPU variants (FFT alone, shift alone, FFT then shift as two
    kernels, FFT with the shift fused in as an output transformation) and the
    corresponding numpy reference timings, asserting that both GPU paths
    match the numpy result.

    :param thr: a reikna CLUDA Thread to compile and run on.
    :param shape: array shape for the test data.
    :param dtype: numpy dtype of the test data (cast from real normal samples).
    :param axes: axes to transform/shift over; ``None`` means all axes.
    :returns: dict of timing values in seconds, keyed ``t_gpu_*`` / ``t_cpu_*``.
    """
    data = numpy.random.normal(size=shape).astype(dtype)

    fft = FFT(data, axes=axes)
    fftc = fft.compile(thr)
    shift = FFTShift(data, axes=axes)
    shiftc = shift.compile(thr)

    # separate calculation
    # Each timed section synchronizes before reading the clock, since kernel
    # launches are asynchronous.
    data_dev = thr.to_device(data)
    t_start = time.time()
    fftc(data_dev, data_dev)
    thr.synchronize()
    t_gpu_fft = time.time() - t_start

    t_start = time.time()
    shiftc(data_dev, data_dev)
    thr.synchronize()
    t_gpu_shift = time.time() - t_start

    # Re-upload fresh data so the combined (FFT then shift) timing starts
    # from the same input as the fused version below.
    data_dev = thr.to_device(data)
    t_start = time.time()
    fftc(data_dev, data_dev)
    shiftc(data_dev, data_dev)
    thr.synchronize()
    t_gpu_separate = time.time() - t_start

    # transformation
    # Attach the fftshift index permutation to the FFT's output parameter so
    # both run as a single computation.
    data_dev2 = thr.to_device(data)
    shift_tr = fftshift(data, axes=axes)
    fft2 = fft.parameter.output.connect(shift_tr, shift_tr.input, new_output=shift_tr.output)
    fft2c = fft2.compile(thr)
    t_start = time.time()
    fft2c(data_dev2, data_dev2)
    thr.synchronize()
    t_gpu_combined = time.time() - t_start

    # reference
    t_start = time.time()
    numpy.fft.fftn(data, axes=axes)
    t_cpu_fft = time.time() - t_start

    t_start = time.time()
    numpy.fft.fftshift(data, axes=axes)
    t_cpu_shift = time.time() - t_start

    t_start = time.time()
    data_ref = numpy.fft.fftn(data, axes=axes)
    data_ref = numpy.fft.fftshift(data_ref, axes=axes)
    t_cpu_all = time.time() - t_start

    # Both GPU paths (separate kernels and fused transformation) must agree
    # with the numpy reference.
    data_gpu = data_dev.get()
    data_gpu2 = data_dev2.get()
    assert numpy.allclose(data_ref, data_gpu)
    assert numpy.allclose(data_ref, data_gpu2)

    return dict(
        t_gpu_fft=t_gpu_fft,
        t_gpu_shift=t_gpu_shift,
        t_gpu_separate=t_gpu_separate,
        t_gpu_combined=t_gpu_combined,
        t_cpu_fft=t_cpu_fft,
        t_cpu_shift=t_cpu_shift,
        t_cpu_all=t_cpu_all)
def run_tests(thr, shape, dtype, axes=None, attempts=10):
    """Repeat the benchmark ``attempts`` times and keep the best (minimum) time per key."""
    timings = []
    for _ in range(attempts):
        timings.append(run_test(thr, shape, dtype, axes=axes))
    best = {}
    for key in timings[0]:
        best[key] = min(t[key] for t in timings)
    return best
if __name__ == '__main__':
    # Create a CUDA thread and run the benchmark on a 2D complex array,
    # shifting along the last axis only.
    api = cuda_api()
    thr = api.Thread.create()

    shape = (1024, 1024)
    dtype = numpy.complex128
    axes = (1,)

    results = run_tests(thr, shape, dtype, axes=axes)

    print('device:', thr._device.name)
    print('shape:', shape)
    print('dtype:', dtype)
    print('axes:', axes)
    for key in results:
        print(key, ':', results[key])

    # Label each CPU/GPU timing pair and report the resulting speedup.
    comparisons = (
        ("Speedup for a separate calculation:", 't_cpu_all', 't_gpu_separate'),
        ("Speedup for a combined calculation:", 't_cpu_all', 't_gpu_combined'),
        ("Speedup for fft alone:", 't_cpu_fft', 't_gpu_fft'),
        ("Speedup for shift alone:", 't_cpu_shift', 't_gpu_shift'),
    )
    for label, cpu_key, gpu_key in comparisons:
        print(
            label,
            results[cpu_key] / results[gpu_key])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment