GDAL + CUDA - (a.cu: basic addition kernel) (b.cu: trig distance from an origin point)
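Both files read an SRTM .hgt tile with GDAL, push it through a CUDA kernel, and copy the result back to the host. A plausible way to build them (package name and flags are assumptions; adjust for your system):

// Build sketch: assumes nvcc is on PATH and the GDAL development headers are
// installed (e.g. libgdal-dev on Debian/Ubuntu).
//   nvcc -O2 a.cu -o a -lgdal
//   nvcc -O2 b.cu -o b -lgdal

// a.cu: basic addition kernel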
#include <cassert>
#include <cstdio>
#include <vector>
#include <gdal/gdal_priv.h>
#include <cuda_runtime.h>
// Kernel: add a fixed offset (1000) to every elevation sample.
__global__ void
vector_add(const int16_t* in, int16_t* out, int num_elements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < num_elements) {
        out[i] = in[i] + 1000;
    }
}
int
main()
{
    GDALAllRegister();
    GDALDataset *dataset = (GDALDataset*) GDALOpen("data/N33W099.hgt", GA_ReadOnly);
    if (dataset == nullptr) {
        fprintf(stderr, "Failed to open dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALRasterBand *band = dataset->GetRasterBand(1);
    int x_size = band->GetXSize();
    int y_size = band->GetYSize();
    int num_elements = x_size * y_size;
    size_t size = num_elements * sizeof(int16_t);
    std::vector<int16_t> buf(num_elements);
    std::vector<int16_t> out(num_elements);
    auto gdal_err = band->RasterIO(GF_Read, 0, 0, x_size, y_size, buf.data(), x_size, y_size, GDT_Int16, 0, 0);
    if (gdal_err != CE_None) {
        fprintf(stderr, "Failed to read dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALClose((GDALDatasetH) dataset);
    // END GDAL CODE
    cudaError_t err = cudaSuccess;
    // allocate device input
    int16_t *device_buf = nullptr;
    err = cudaMalloc((void **)&device_buf, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // allocate device output buffer
    int16_t *device_out = nullptr;
    err = cudaMalloc((void **)&device_out, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device output buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy input buf from host to device
    err = cudaMemcpy(device_buf, buf.data(), size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy buf from host to device (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    int threads_per_block = 256;
    int blocks_per_grid = (num_elements + threads_per_block - 1) / threads_per_block;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocks_per_grid, threads_per_block);
    vector_add<<<blocks_per_grid, threads_per_block>>>(device_buf, device_out, num_elements);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vector_add kernel (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy the device result from device memory to host memory
    // (cudaMemcpy blocks until the kernel has finished, so no explicit sync is needed)
    err = cudaMemcpy(out.data(), device_out, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy result from device to host (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < num_elements; i++) {
        assert(buf[i] + 1000 == out[i]);
    }
    printf("Tests passed!\n");
    cudaFree(device_buf);
    cudaFree(device_out);
    printf("Done\n");
    return 0;
}
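For inputs larger than one launch can cover, or to decouple grid size from data size, a common CUDA idiom is the grid-stride loop. A minimal sketch of how a.cu's kernel could be generalized (this variant is not part of the original gist):

// Hypothetical grid-stride variant of a.cu's kernel: each thread walks the
// buffer in steps of the total thread count, so any grid size covers any input.
__global__ void
vector_add_strided(const int16_t* in, int16_t* out, int num_elements)
{
    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
         i < num_elements;
         i += blockDim.x * gridDim.x) {
        out[i] = in[i] + 1000;
    }
}

// b.cu: trig distance from an origin point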
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <vector>
#include <gdal/gdal_priv.h>
#include <cuda_runtime.h>
// Kernel: distance from each cell to an origin point (ox, oy), scaled by the
// cell resolution `res` in metres. The elevation buffer `in` is currently unused.
// Indexing with the padded grid width (gridDim.x * blockDim.x) would skew rows
// whenever the raster width is not a multiple of the block width, so the real
// raster dimensions are passed in and used instead. std:: math functions are
// host functions and generally can't be called from device code, so sqrtf is
// used, with the redundant abs/pow folded into plain multiplication.
__global__ void
vector_add(const int16_t* in, float* out, int width, int height, int ox, int oy, int res)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < width && y < height) {
        float dx = ox - x;
        float dy = oy - y;
        out[y * width + x] = res * sqrtf(dx * dx + dy * dy);
    }
}
int
main()
{
    GDALAllRegister();
    GDALDataset *dataset = (GDALDataset*) GDALOpen("data/N33W099.hgt", GA_ReadOnly);
    if (dataset == nullptr) {
        fprintf(stderr, "Failed to open dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALRasterBand *band = dataset->GetRasterBand(1);
    int x_size = band->GetXSize();
    int y_size = band->GetYSize();
    int num_elements = x_size * y_size;
    size_t size = num_elements * sizeof(int16_t);
    std::vector<int16_t> buf(num_elements);
    std::vector<float> out(num_elements);
    auto gdal_err = band->RasterIO(GF_Read, 0, 0, x_size, y_size, buf.data(), x_size, y_size, GDT_Int16, 0, 0);
    if (gdal_err != CE_None) {
        fprintf(stderr, "Failed to read dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALClose((GDALDatasetH) dataset);
    // END GDAL CODE
    printf("Processing %d elements\n", num_elements);
    cudaError_t err = cudaSuccess;
    // allocate device input
    int16_t *device_buf = nullptr;
    err = cudaMalloc((void **)&device_buf, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // allocate device output buffer
    float *device_out = nullptr;
    err = cudaMalloc((void **)&device_out, num_elements * sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device output buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy input buf from host to device
    err = cudaMemcpy(device_buf, buf.data(), size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy buf from host to device (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    dim3 threads_per_block(16, 16);
    dim3 blocks_per_grid((x_size + threads_per_block.x - 1) / threads_per_block.x,
                         (y_size + threads_per_block.y - 1) / threads_per_block.y);
    auto start = std::chrono::high_resolution_clock::now();
    vector_add<<<blocks_per_grid, threads_per_block>>>(device_buf, device_out, x_size, y_size, x_size / 2, y_size / 2, 30);
    // kernel launches are asynchronous: without a synchronize, the timer would
    // only measure the launch overhead, not the kernel itself
    cudaDeviceSynchronize();
    auto dur = std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::high_resolution_clock::now() - start);
    printf("%fus\n", dur.count() * 1'000'000);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vector_add kernel (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy the device result from device memory to host memory;
    // `out` holds floats, so the byte count is num_elements * sizeof(float), not `size`
    err = cudaMemcpy(out.data(), device_out, num_elements * sizeof(float), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy result from device to host (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    cudaFree(device_buf);
    cudaFree(device_out);
    printf("Done\n");
    return 0;
}
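b.cu copies the result back but, unlike a.cu, never verifies it. A minimal host-side spot check, written as a fragment that would sit in main after the device-to-host copy (the sampling stride of 997 and the error tolerance are arbitrary choices, not from the original gist):

    // Hypothetical spot check: recompute a sparse sample of distances on the
    // CPU and compare against the kernel's output. ox, oy and res mirror the
    // launch arguments above.
    int ox = x_size / 2, oy = y_size / 2, res = 30;
    for (int y = 0; y < y_size; y += 997) {
        for (int x = 0; x < x_size; x += 997) {
            float dx = ox - x, dy = oy - y;
            float expected = res * std::sqrt(dx * dx + dy * dy);
            // allow a small relative error: host and device sqrt may differ by a few ulps
            assert(std::fabs(out[y * x_size + x] - expected) <= 1e-4f * expected + 1e-3f);
        }
    }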