Skip to content

Instantly share code, notes, and snippets.

@haampie
Created December 3, 2018 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haampie/03e6af6569a09966eb4365fb0b1b51cc to your computer and use it in GitHub Desktop.
cerfacs.cu
// Computes y <- alpha * A * x + beta * y for tall and skinny A.
// Compile with `nvcc -O3 -o cerfacs cerfacs.cu`
// Assumes we have a *large* basis of COLS = 100 columns, you can play with this param
// Timing is measured without copies from / to device (copies should not happen in a good impl of arnoldi anyways)
// Assumes a fixed number of 256 threads per block.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
// Number of columns of A == number of thread blocks launched (one block per column).
#define COLS 100
// Threads per block; the reduction in sgemvt is hard-coded for exactly 256 threads.
#define THREADS 256
// Wall-clock timer: stores the current time, in seconds, as a double into `now`.
#define GET_TIME(now) { \
struct timeval t; \
gettimeofday(&t, NULL); \
now = t.tv_sec + t.tv_usec/1000000.0; \
}
// 1-D float texture through which the kernel reads the vector x (read-only cache path).
// NOTE(review): the texture<> reference API is deprecated and removed in CUDA 12;
// modern code should use cudaTextureObject_t or plain `const float* __restrict__` loads.
texture<float, 1, cudaReadModeElementType> tex_x;
// Folds the first 64 partial sums in shared memory `s` down into s[0].
// Must be called with tid < 32 only (i.e. by the first warp of the block).
//
// The original version (`s[tid] += s[tid + 32]; ...` on a volatile pointer)
// relied on implicit warp-synchronous execution. Since Volta (SM70),
// independent thread scheduling means lanes of a warp may diverge, so each
// read phase and write phase is now separated by explicit __syncwarp()
// barriers to make the reduction race-free on all architectures.
__device__ void warpReduce(volatile float * s, int tid)
{
    float v = s[tid];
    #pragma unroll
    for (int offset = 32; offset > 0; offset >>= 1) {
        v += s[tid + offset]; // read the partner lane's partial sum
        __syncwarp();         // all reads finish before anyone overwrites
        s[tid] = v;           // publish the updated partial sum
        __syncwarp();         // all writes visible before the next round
    }
}
// Computes y[b] <- alpha * dot(A(:,b), x) + beta * y[b] for block b, where A
// is stored column-major with leading dimension `rows` and x is read through
// the global texture tex_x (bound by the host before launch).
// Launch configuration: <<<cols, THREADS>>>; the shared-memory reduction
// below is hard-coded for a block of exactly THREADS == 256 threads.
__global__
void sgemvt(int rows, int cols, float * y, float * A, float *x, float alpha, float beta)
{
    // Assume there are <= 256 threads
    __shared__ float s[THREADS];
    int tid = threadIdx.x;

    // Each thread accumulates a strided slice of the dot product; consecutive
    // threads read consecutive elements of A, so global loads are coalesced.
    float tmp = 0.0f;
    for (int i = tid, j = blockIdx.x * rows + tid; i < rows; i += THREADS, j += THREADS) {
        tmp += A[j] * tex1Dfetch(tex_x, i);
    }
    s[tid] = tmp;
    __syncthreads();

    // Tree reduction in shared memory: 256 -> 128 -> 64 partials.
    if (tid < 128) {
        s[tid] += s[tid + 128];
    }
    __syncthreads();
    if (tid < 64) {
        s[tid] += s[tid + 64];
    }
    __syncthreads();

    // Final warp: fold 64 -> 32 partials via shared memory, then reduce the
    // last 32 values with warp shuffles. The original volatile-shared-memory
    // trick assumed warp-synchronous execution, which is no longer guaranteed
    // on Volta (SM70) and later; __shfl_down_sync with a full mask is safe.
    if (tid < 32) {
        float v = s[tid] + s[tid + 32];
        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            v += __shfl_down_sync(0xffffffff, v, offset);
        }
        // Lane 0 holds the full dot product; apply alpha/beta and store.
        if (tid == 0) {
            y[blockIdx.x] = alpha * v + beta * y[blockIdx.x];
        }
    }
}
// For reference on the CPU
// Host reference implementation: for each column c of the column-major
// matrix A (rows x cols, leading dimension `rows`), computes
//   y[c] = alpha * dot(A(:,c), x) + beta * y[c].
void sgemvt_serial(int rows, int cols, float * y, float * A, float * x, float alpha, float beta)
{
    for (int c = 0; c < cols; ++c) {
        const float * column = A + c * rows; // start of column c
        float dot = 0.0f;
        for (int r = 0; r < rows; ++r)
            dot += column[r] * x[r];
        y[c] = alpha * dot + beta * y[c];
    }
}
// Abort with a diagnostic if a CUDA runtime call failed.
static void cudaCheck(cudaError_t err, const char * what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

int main()
{
    const unsigned int rows = 11948;
    const unsigned int cols = COLS;
    const unsigned int n = rows * cols;

    // Host buffers: A (column-major, rows x cols), input vector x, and three
    // copies of y (pristine input, CPU result, GPU result).
    float * A = (float*) malloc(n * sizeof(float));
    float * x = (float*) malloc(rows * sizeof(float));
    float * y_original = (float*) malloc(cols * sizeof(float));
    float * y1 = (float*) malloc(cols * sizeof(float));
    float * y2 = (float*) malloc(cols * sizeof(float));
    if (!A || !x || !y_original || !y1 || !y2) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Fill everything with uniform random values in [0, 1].
    for (unsigned int i = 0; i < n; ++i) {
        A[i] = ((float)rand()) / ((float)RAND_MAX);
    }
    for (unsigned int i = 0; i < rows; ++i) {
        x[i] = ((float)rand()) / ((float)RAND_MAX);
    }
    for (unsigned int i = 0; i < cols; ++i) {
        y_original[i] = ((float)rand()) / ((float)RAND_MAX);
    }

    // CPU reference result in y1.
    memcpy(y1, y_original, cols * sizeof(float));
    sgemvt_serial(rows, cols, y1, A, x, 1.0f, 1.0f);

    // GPU version; host<->device copies are excluded from the timing on purpose.
    double start, finish;
    float * d_A;
    float * d_x;
    float * d_y;
    cudaCheck(cudaMalloc(&d_A, n * sizeof(float)), "cudaMalloc d_A");
    cudaCheck(cudaMalloc(&d_x, rows * sizeof(float)), "cudaMalloc d_x");
    cudaCheck(cudaMalloc(&d_y, cols * sizeof(float)), "cudaMalloc d_y");
    cudaCheck(cudaMemcpy(d_A, A, n * sizeof(float), cudaMemcpyHostToDevice), "copy A");
    cudaCheck(cudaMemcpy(d_x, x, rows * sizeof(float), cudaMemcpyHostToDevice), "copy x");
    cudaCheck(cudaMemcpy(d_y, y_original, cols * sizeof(float), cudaMemcpyHostToDevice), "copy y");
    cudaCheck(cudaBindTexture(0, tex_x, d_x, rows * sizeof(float)), "bind tex_x");

    GET_TIME(start);
    sgemvt<<<COLS, THREADS>>>(rows, cols, d_y, d_A, d_x, 1.0f, 1.0f);
    cudaCheck(cudaGetLastError(), "kernel launch");
    // Kernel launches are asynchronous: without this synchronization the timer
    // would only measure the launch overhead, not the kernel execution itself.
    cudaCheck(cudaDeviceSynchronize(), "kernel execution");
    GET_TIME(finish);

    cudaCheck(cudaMemcpy(y2, d_y, cols * sizeof(float), cudaMemcpyDeviceToHost), "copy y back");
    printf("GPU: %e seconds\n", finish - start);

    // 2-norm of the CPU/GPU difference. A small nonzero value is expected:
    // the two implementations sum the products in a different order.
    float diff = 0.0f;
    for (unsigned i = 0; i < cols; ++i) {
        diff += (y1[i] - y2[i]) * (y1[i] - y2[i]);
    }
    printf("norm(y_cpu - y_gpu) = %e\n", sqrt(diff));

    // Cleanup.
    cudaUnbindTexture(tex_x);
    cudaFree(d_A);
    cudaFree(d_x);
    cudaFree(d_y);
    free(A);
    free(x);
    free(y_original);
    free(y1);
    free(y2);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment