Skip to content

Instantly share code, notes, and snippets.

@haampie
Created December 3, 2018 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haampie/03e6af6569a09966eb4365fb0b1b51cc to your computer and use it in GitHub Desktop.
cerfacs.cu
// Computes y <- alpha * A * x + beta * y for tall and skinny A.
// Compile with `nvcc -O3 -o cerfacs cerfacs.cu`
// Assumes we have a *large* basis of COLS = 100 columns, you can play with this param
// Timing is measured without copies from / to device (copies should not happen in a good impl of arnoldi anyways)
// Assumes a fixed number of 256 threads per block.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
// Number of columns of A == number of thread blocks launched (one block per column).
#define COLS 100
// Threads per block; the reduction in sgemvt is hard-coded for exactly 256 threads.
#define THREADS 256
// Wall-clock timer: stores the current time, in seconds, as a double into `now`.
#define GET_TIME(now) { \
struct timeval t; \
gettimeofday(&t, NULL); \
now = t.tv_sec + t.tv_usec/1000000.0; \
}
// 1-D float texture through which the kernel reads the vector x (read-only cache path).
// NOTE(review): the texture<> reference API is deprecated and removed in CUDA 12;
// modern code should use cudaTextureObject_t or plain `const float* __restrict__` loads.
texture<float, 1, cudaReadModeElementType> tex_x;
// Folds the first 64 partial sums in shared memory `s` down into s[0].
// Must be called with tid < 32 only (i.e. by the first warp of the block).
//
// The original version (`s[tid] += s[tid + 32]; ...` on a volatile pointer)
// relied on implicit warp-synchronous execution. Since Volta (SM70),
// independent thread scheduling means lanes of a warp may diverge, so each
// read phase and write phase is now separated by explicit __syncwarp()
// barriers to make the reduction race-free on all architectures.
__device__ void warpReduce(volatile float * s, int tid)
{
    float v = s[tid];
    #pragma unroll
    for (int offset = 32; offset > 0; offset >>= 1) {
        v += s[tid + offset]; // read the partner lane's partial sum
        __syncwarp();         // all reads finish before anyone overwrites
        s[tid] = v;           // publish the updated partial sum
        __syncwarp();         // all writes visible before the next round
    }
}
// Computes y[b] <- alpha * dot(A(:,b), x) + beta * y[b] for block b, where A
// is stored column-major with leading dimension `rows` and x is read through
// the global texture tex_x (bound by the host before launch).
// Launch configuration: <<<cols, THREADS>>>; the shared-memory reduction
// below is hard-coded for a block of exactly THREADS == 256 threads.
__global__
void sgemvt(int rows, int cols, float * y, float * A, float *x, float alpha, float beta)
{
    // Assume there are <= 256 threads
    __shared__ float s[THREADS];
    int tid = threadIdx.x;

    // Each thread accumulates a strided slice of the dot product; consecutive
    // threads read consecutive elements of A, so global loads are coalesced.
    float tmp = 0.0f;
    for (int i = tid, j = blockIdx.x * rows + tid; i < rows; i += THREADS, j += THREADS) {
        tmp += A[j] * tex1Dfetch(tex_x, i);
    }
    s[tid] = tmp;
    __syncthreads();

    // Tree reduction in shared memory: 256 -> 128 -> 64 partials.
    if (tid < 128) {
        s[tid] += s[tid + 128];
    }
    __syncthreads();
    if (tid < 64) {
        s[tid] += s[tid + 64];
    }
    __syncthreads();

    // Final warp: fold 64 -> 32 partials via shared memory, then reduce the
    // last 32 values with warp shuffles. The original volatile-shared-memory
    // trick assumed warp-synchronous execution, which is no longer guaranteed
    // on Volta (SM70) and later; __shfl_down_sync with a full mask is safe.
    if (tid < 32) {
        float v = s[tid] + s[tid + 32];
        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            v += __shfl_down_sync(0xffffffff, v, offset);
        }
        // Lane 0 holds the full dot product; apply alpha/beta and store.
        if (tid == 0) {
            y[blockIdx.x] = alpha * v + beta * y[blockIdx.x];
        }
    }
}
// For reference on the CPU
// Host reference implementation: for each column c of the column-major
// matrix A (rows x cols, leading dimension `rows`), computes
//   y[c] = alpha * dot(A(:,c), x) + beta * y[c].
void sgemvt_serial(int rows, int cols, float * y, float * A, float * x, float alpha, float beta)
{
    for (int c = 0; c < cols; ++c) {
        const float * column = A + c * rows; // start of column c
        float dot = 0.0f;
        for (int r = 0; r < rows; ++r)
            dot += column[r] * x[r];
        y[c] = alpha * dot + beta * y[c];
    }
}
// Abort with a diagnostic if a CUDA runtime call failed.
static void cudaCheck(cudaError_t err, const char * what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

int main()
{
    const unsigned int rows = 11948;
    const unsigned int cols = COLS;
    const unsigned int n = rows * cols;

    // Host buffers: A (column-major, rows x cols), input vector x, and three
    // copies of y (pristine input, CPU result, GPU result).
    float * A = (float*) malloc(n * sizeof(float));
    float * x = (float*) malloc(rows * sizeof(float));
    float * y_original = (float*) malloc(cols * sizeof(float));
    float * y1 = (float*) malloc(cols * sizeof(float));
    float * y2 = (float*) malloc(cols * sizeof(float));
    if (!A || !x || !y_original || !y1 || !y2) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Fill everything with uniform random values in [0, 1].
    for (unsigned int i = 0; i < n; ++i) {
        A[i] = ((float)rand()) / ((float)RAND_MAX);
    }
    for (unsigned int i = 0; i < rows; ++i) {
        x[i] = ((float)rand()) / ((float)RAND_MAX);
    }
    for (unsigned int i = 0; i < cols; ++i) {
        y_original[i] = ((float)rand()) / ((float)RAND_MAX);
    }

    // CPU reference result in y1.
    memcpy(y1, y_original, cols * sizeof(float));
    sgemvt_serial(rows, cols, y1, A, x, 1.0f, 1.0f);

    // GPU version; host<->device copies are excluded from the timing on purpose.
    double start, finish;
    float * d_A;
    float * d_x;
    float * d_y;
    cudaCheck(cudaMalloc(&d_A, n * sizeof(float)), "cudaMalloc d_A");
    cudaCheck(cudaMalloc(&d_x, rows * sizeof(float)), "cudaMalloc d_x");
    cudaCheck(cudaMalloc(&d_y, cols * sizeof(float)), "cudaMalloc d_y");
    cudaCheck(cudaMemcpy(d_A, A, n * sizeof(float), cudaMemcpyHostToDevice), "copy A");
    cudaCheck(cudaMemcpy(d_x, x, rows * sizeof(float), cudaMemcpyHostToDevice), "copy x");
    cudaCheck(cudaMemcpy(d_y, y_original, cols * sizeof(float), cudaMemcpyHostToDevice), "copy y");
    cudaCheck(cudaBindTexture(0, tex_x, d_x, rows * sizeof(float)), "bind tex_x");

    GET_TIME(start);
    sgemvt<<<COLS, THREADS>>>(rows, cols, d_y, d_A, d_x, 1.0f, 1.0f);
    cudaCheck(cudaGetLastError(), "kernel launch");
    // Kernel launches are asynchronous: without this synchronization the timer
    // would only measure the launch overhead, not the kernel execution itself.
    cudaCheck(cudaDeviceSynchronize(), "kernel execution");
    GET_TIME(finish);

    cudaCheck(cudaMemcpy(y2, d_y, cols * sizeof(float), cudaMemcpyDeviceToHost), "copy y back");
    printf("GPU: %e seconds\n", finish - start);

    // 2-norm of the CPU/GPU difference. A small nonzero value is expected:
    // the two implementations sum the products in a different order.
    float diff = 0.0f;
    for (unsigned i = 0; i < cols; ++i) {
        diff += (y1[i] - y2[i]) * (y1[i] - y2[i]);
    }
    printf("norm(y_cpu - y_gpu) = %e\n", sqrt(diff));

    // Cleanup.
    cudaUnbindTexture(tex_x);
    cudaFree(d_A);
    cudaFree(d_x);
    cudaFree(d_y);
    free(A);
    free(x);
    free(y_original);
    free(y1);
    free(y2);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment