Use of CUDA DP4A instruction using PyCUDA
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np


def compute_xcorr_cpu(d):
    """ CPU reference: cross-correlate antenna pairs, <d x conj(d)>, summed over time. """
    dc = d.astype('float32').view('complex64')
    dc = dc.transpose((0, 2, 3, 1)).copy()
    xcorr_cpu = np.einsum('...i,...j', dc, np.conj(dc)).view('float32').astype('int32').sum(axis=-4)
    return xcorr_cpu
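# Shape walk-through (illustrative, for the d defined below): (F, N, T, 2) int8
# is viewed as (F, N, T, 1) complex64 and transposed to (F, T, 1, N); the einsum
# outer product gives (F, T, 1, N, N), and the float32 view plus the sum over
# the time axis yields (F, 1, N, 2*N) int32, matching the GPU output buffer
# xcorr of shape (F, N, N*2) after squeeze().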
mod = SourceModule("""
#include <stdio.h>

__forceinline__ __device__
void dp4a(int &c, const int &a, const int &b) {
#if __CUDA_ARCH__ >= 610
    asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c));
#else
    // Fallback for pre-Pascal GPUs: unpack to four 8-bit lanes and
    // multiply-accumulate manually
    const char4 &a4 = *((const char4*)&a);
    const char4 &b4 = *((const char4*)&b);
    c += a4.x * b4.x;
    c += a4.y * b4.y;
    c += a4.z * b4.z;
    c += a4.w * b4.w;
#endif
}
/*
cmult_dp4a -- complex conjugate multiply-accumulate, <A * Conj(B)>,
using two dp4a instructions. Takes 8-bit complex data
packed as a single 32-bit int [8R 8I 8R 8I].

For two complex numbers:
    ab* = (ar + i*ai)(br - i*bi)
    re(ab*) = ar*br + ai*bi
    im(ab*) = ai*br - ar*bi

So use two dp4a instructions to compute:
    [a0r a0i a1r a1i].[ b0r b0i  b1r b1i] = Re(<ab*>)
    [a0r a0i a1r a1i].[-b0i b0r -b1i b1r] = Im(<ab*>)

where angled brackets denote time averaging (over 2x samples).
*/
__forceinline__ __device__
void cmult_dp4a(int &res_re, int &res_im, const int &A, const int &B) {
    // Unpack 32-bit int into four 8-bit values
    int8_t Bmod[4];
    const int8_t *b8 = (const int8_t *)&B;

    // Rearrange B into [-b0i b0r -b1i b1r] for the imaginary part
    Bmod[0] = -b8[1];
    Bmod[1] =  b8[0];
    Bmod[2] = -b8[3];
    Bmod[3] =  b8[2];

    // Repack the rearranged 8-bit values into a 32-bit int
    int &Bmodp = *((int *)&Bmod);

    // Run complex conjugate multiply-accumulate
    dp4a(res_re, A, B);
    dp4a(res_im, A, Bmodp);
}
__global__ void cplxConjOuterProduct
(int *data, int *xcorr, int N, int F, int T)
{
    // One thread per antenna pair (x, y); one block per frequency channel
    int x = threadIdx.x;
    int y = threadIdx.y;
    int idx, ia, ib;

    int chan_offset_in  = blockIdx.x * N * T/2;
    int chan_offset_out = blockIdx.x * N * N * 2;
    int ant_offset = T / 2;  // T complex samples = 2T bytes = T/2 packed ints

    // Compute index for output array (interleaved re/im per antenna pair)
    idx = 2*y + N*2*x + chan_offset_out;

    // Accumulate over time, two complex samples per packed int
    for (int t = 0; t < T/2; t++) {
        ia = ant_offset*x + chan_offset_in + t;
        ib = ant_offset*y + chan_offset_in + t;
        cmult_dp4a(xcorr[idx], xcorr[idx+1], data[ia], data[ib]);
    }
}
""")
# Problem dimensions
N = 12    # Number of antennas (2-32)
F = 320   # Number of frequency channels
T = 1024  # Number of time samples

# Create complex test data (int8 re/im pairs) and the output array
d = np.random.randint(64, size=(F, N, T, 2), dtype='int8')
xcorr = np.zeros((F, N, N*2), dtype='int32')
# Allocate GPU memory and copy inputs across
d_gpu = cuda.mem_alloc(d.nbytes)
xcorr_gpu = cuda.mem_alloc(xcorr.nbytes)
cuda.memcpy_htod(d_gpu, d)
cuda.memcpy_htod(xcorr_gpu, xcorr)

# Run kernel: one block per frequency channel, one thread per antenna pair
compute_xcorr_gpu = mod.get_function("cplxConjOuterProduct")
compute_xcorr_gpu(d_gpu, xcorr_gpu, np.int32(N), np.int32(F), np.int32(T),
                  block=(N, N, 1), grid=(F, 1, 1))
cuda.memcpy_dtoh(xcorr, xcorr_gpu)

# Confirm against CPU reference
xcorr_cpu = compute_xcorr_cpu(d)
assert np.allclose(xcorr.squeeze(), xcorr_cpu.squeeze())
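
# A minimal sanity check (not in the original gist) of the Re/Im dot-product
# identities from the cmult_dp4a comment block, using two complex samples per
# dot product; the names below are illustrative only.
aa = np.random.randint(-64, 64, size=4).astype('float32').view('complex64')
bb = np.random.randint(-64, 64, size=4).astype('float32').view('complex64')
A_packed    = np.array([aa[0].real, aa[0].imag, aa[1].real, aa[1].imag])
B_packed    = np.array([bb[0].real, bb[0].imag, bb[1].real, bb[1].imag])
Bmod_packed = np.array([-bb[0].imag, bb[0].real, -bb[1].imag, bb[1].real])
acc = (aa * np.conj(bb)).sum()
assert np.dot(A_packed, B_packed) == acc.real     # first dp4a: Re(<ab*>)
assert np.dot(A_packed, Bmod_packed) == acc.imag  # second dp4a: Im(<ab*>)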
"""
# dp4a_example.py
Simple test of DP4A instruction using PyCUDA.
"""
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
def dp4a_cpu(a, b):
    """ Apply DP4A instruction, Python version.

    Compute the elementwise product a*b of the two arrays, then sum
    each set of four adjacent elements. E.g., for two arrays:
        a = [A B C D]
        b = [E F G H]
    this returns AE + BF + CG + DH.

    Args:
        a (np.array): Numpy array of 8-bit integers of length 4N
        b (np.array): Numpy array of 8-bit integers of length 4N

    Returns:
        c (np.array): Numpy array of 32-bit integers of length N
    """
    try:
        assert str(a.dtype) == str(b.dtype) == 'int8'
        assert len(a) % 4 == len(b) % 4 == 0
    except AssertionError:
        print("Error: This must be run on a pair of int8 numpy arrays, "
              "with 4N elements in each.")
        return 0
    z = a.astype('int32') * b.astype('int32')
    z = z.reshape((-1, 4)).sum(axis=-1)
    return z
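
# Quick illustrative check of dp4a_cpu (example values, not from the
# original gist): each group of four products collapses to one int32.
a_demo = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype='int8')
b_demo = np.array([1, 1, 1, 1, 2, 2, 2, 2], dtype='int8')
assert (dp4a_cpu(a_demo, b_demo) == [10, 52]).all()  # 1+2+3+4, 2*(5+6+7+8)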
mod = SourceModule("""
__forceinline__ __device__
void dp4a(int &c, const int &a, const int &b) {
#if __CUDA_ARCH__ >= 610
    asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c));
#else
    // Fallback for pre-Pascal GPUs
    const char4 &a4 = *((const char4*)&a);
    const char4 &b4 = *((const char4*)&b);
    c += a4.x * b4.x;
    c += a4.y * b4.y;
    c += a4.z * b4.z;
    c += a4.w * b4.w;
#endif
}

__global__ void multDp4a(int *A, int *B, int *C, int nx)
{
    // One thread per packed 32-bit word (four int8 values)
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    dp4a(C[ix], A[ix], B[ix]);
}
""")
# Setup some test vectors
# c = dp4a(a, b): input is 8-bit, output is 32-bit
nx = 1024 * 4
a = np.random.randint(low=-127, high=128, size=nx, dtype='int8')
b = np.random.randint(low=-127, high=128, size=nx, dtype='int8')
c = np.zeros(nx // 4, dtype='int32')  # integer division for Python 3
# Compute CPU reference
c_cpu = dp4a_cpu(a, b)
# Allocate memory on GPU
a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)
# Copy to GPU
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)
cuda.memcpy_htod(c_gpu, c)
# Run kernel on GPU (nx // 4 = 1024 threads in a single block)
func = mod.get_function("multDp4a")
func(a_gpu, b_gpu, c_gpu, np.int32(nx), block=(nx // 4, 1, 1))
# Copy back
cuda.memcpy_dtoh(c, c_gpu)
# Confirm against CPU version
assert np.allclose(c, c_cpu)
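
# Note (assumption, not from the original gist): nx // 4 = 1024 threads is the
# per-block maximum on current NVIDIA GPUs, so for larger nx the launch above
# should split the work across blocks, e.g.:
#   threads = 256
#   blocks = (nx // 4 + threads - 1) // threads
#   func(a_gpu, b_gpu, c_gpu, np.int32(nx), block=(threads, 1, 1), grid=(blocks, 1, 1))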