Created
December 3, 2018 16:30
-
-
Save haampie/03e6af6569a09966eb4365fb0b1b51cc to your computer and use it in GitHub Desktop.
cerfacs.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Computes y <- alpha * A * x + beta * y for tall and skinny A. | |
// Compile with `nvcc -O3 -o cerfacs cerfacs.cu` | |
// Assumes we have a *large* basis of COLS = 100 columns, you can play with this param | |
// Timing is measured without copies from / to device (copies should not happen in a good impl of arnoldi anyways) | |
// Assumes a fixed number of 256 threads per block. | |
#include <stdio.h> | |
#include <sys/time.h> | |
#define COLS 100 | |
#define THREADS 256 | |
#define GET_TIME(now) { \ | |
struct timeval t; \ | |
gettimeofday(&t, NULL); \ | |
now = t.tv_sec + t.tv_usec/1000000.0; \ | |
} | |
texture<float, 1, cudaReadModeElementType> tex_x; | |
__device__ void warpReduce(volatile float * s, int tid) | |
{ | |
s[tid] += s[tid + 32]; | |
s[tid] += s[tid + 16]; | |
s[tid] += s[tid + 8]; | |
s[tid] += s[tid + 4]; | |
s[tid] += s[tid + 2]; | |
s[tid] += s[tid + 1]; | |
} | |
__global__ | |
void sgemvt(int rows, int cols, float * y, float * A, float *x, float alpha, float beta) | |
{ | |
// Assume there are <= 256 threads | |
__shared__ float s[THREADS]; | |
int tid = threadIdx.x; | |
// Part of the MV product | |
float tmp = 0.0; | |
for (int i = tid, j = blockIdx.x * rows + tid; i < rows; i += THREADS, j += THREADS) { | |
tmp += A[j] * tex1Dfetch(tex_x, i); | |
} | |
s[tid] = tmp; | |
__syncthreads(); | |
// Reduction | |
if (tid < 128) { | |
s[tid] += s[tid + 128]; | |
} | |
__syncthreads(); | |
if (tid < 64) { | |
s[tid] += s[tid + 64]; | |
} | |
__syncthreads(); | |
if (tid < 32) { | |
warpReduce(s, tid); | |
} | |
// Store in y | |
if (tid == 0) { | |
y[blockIdx.x] = alpha * s[0] + beta * y[blockIdx.x]; | |
} | |
} | |
// For reference on the CPU | |
void sgemvt_serial(int rows, int cols, float * y, float * A, float * x, float alpha, float beta) | |
{ | |
for (int col = 0; col < cols; ++col) { | |
float s = 0.0; | |
for (int row = 0; row < rows; ++row) { | |
s += A[col * rows + row] * x[row]; | |
} | |
y[col] = alpha * s + beta * y[col]; | |
} | |
} | |
int main() | |
{ | |
const unsigned int rows = 11948; | |
const unsigned int cols = COLS; | |
const unsigned int n = rows * cols; | |
float * A = (float*) malloc(n * sizeof(float)); | |
float * x = (float*) malloc(rows * sizeof(float)); | |
float * y_original = (float*) malloc(cols * sizeof(float)); | |
float * y1 = (float*) malloc(cols * sizeof(float)); | |
float * y2 = (float*) malloc(cols * sizeof(float)); | |
for (unsigned int i = 0; i < n; ++i) { | |
A[i] = ((float)rand()) / ((float)RAND_MAX); | |
} | |
for (unsigned int i = 0; i < rows; ++i) { | |
x[i] = ((float)rand()) / ((float)RAND_MAX); | |
} | |
for (unsigned int i = 0; i < cols; ++i) { | |
y_original[i] = ((float)rand()) / ((float)RAND_MAX); | |
} | |
// Do it on the CPU: | |
memcpy(y1, y_original, cols * sizeof(float)); | |
sgemvt_serial(rows, cols, y1, A, x, 1.0, 1.0); | |
// Do it on the GPU: | |
double start, finish; | |
float * d_A; | |
float * d_x; | |
float * d_y; | |
cudaMalloc(&d_A, n * sizeof(float)); | |
cudaMalloc(&d_x, rows * sizeof(float)); | |
cudaMalloc(&d_y, cols * sizeof(float)); | |
cudaMemcpy(d_A, A, n * sizeof(float), cudaMemcpyHostToDevice); | |
cudaMemcpy(d_x, x, rows * sizeof(float), cudaMemcpyHostToDevice); | |
cudaMemcpy(d_y, y_original, cols * sizeof(float), cudaMemcpyHostToDevice); | |
cudaBindTexture(0, tex_x, d_x, rows * sizeof(float)); | |
GET_TIME(start); | |
sgemvt<<<COLS,THREADS>>>(rows, cols, d_y, d_A, d_x, 1.0, 1.0); | |
GET_TIME(finish); | |
cudaMemcpy(y2, d_y, cols * sizeof(float), cudaMemcpyDeviceToHost); | |
printf("GPU: %e seconds\n", finish - start); | |
// Check norm of diff | |
float diff = 0.0; | |
for (unsigned i = 0; i < cols; ++i) { | |
diff += (y1[i] - y2[i]) * (y1[i] - y2[i]); | |
} | |
printf("norm(y_cpu - y_gpu) = %e\n", sqrt(diff)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment