GDAL + CUDA - (a.cu: basic addition kernel) (b.cu: trig distance from an origin point)
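Both files read an SRTM .hgt tile with GDAL, push it through a CUDA kernel, and copy the result back to the host. A plausible way to build them (package name and flags are assumptions; adjust for your system):

// Build sketch: assumes nvcc is on PATH and the GDAL development headers are
// installed (e.g. libgdal-dev on Debian/Ubuntu).
//   nvcc -O2 a.cu -o a -lgdal
//   nvcc -O2 b.cu -o b -lgdal

// a.cu: basic addition kernel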
#include <cassert>
#include <cstdio>
#include <vector>
#include <gdal/gdal_priv.h>
#include <cuda_runtime.h>
// Kernel: add a fixed offset (1000) to every elevation sample.
__global__ void
vector_add(const int16_t* in, int16_t* out, int num_elements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < num_elements) {
        out[i] = in[i] + 1000;
    }
}
int
main()
{
    GDALAllRegister();
    GDALDataset *dataset = (GDALDataset*) GDALOpen("data/N33W099.hgt", GA_ReadOnly);
    if (dataset == nullptr) {
        fprintf(stderr, "Failed to open dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALRasterBand *band = dataset->GetRasterBand(1);
    int x_size = band->GetXSize();
    int y_size = band->GetYSize();
    int num_elements = x_size * y_size;
    size_t size = num_elements * sizeof(int16_t);
    std::vector<int16_t> buf(num_elements);
    std::vector<int16_t> out(num_elements);
    auto gdal_err = band->RasterIO(GF_Read, 0, 0, x_size, y_size, buf.data(), x_size, y_size, GDT_Int16, 0, 0);
    if (gdal_err != CE_None) {
        fprintf(stderr, "Failed to read dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALClose((GDALDatasetH) dataset);
    // END GDAL CODE
    cudaError_t err = cudaSuccess;
    // allocate device input
    int16_t *device_buf = nullptr;
    err = cudaMalloc((void **)&device_buf, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // allocate device output buffer
    int16_t *device_out = nullptr;
    err = cudaMalloc((void **)&device_out, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device output buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy input buf from host to device
    err = cudaMemcpy(device_buf, buf.data(), size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy buf from host to device (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    int threads_per_block = 256;
    int blocks_per_grid = (num_elements + threads_per_block - 1) / threads_per_block;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocks_per_grid, threads_per_block);
    vector_add<<<blocks_per_grid, threads_per_block>>>(device_buf, device_out, num_elements);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vector_add kernel (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy the device result from device memory to host memory
    // (cudaMemcpy blocks until the kernel has finished, so no explicit sync is needed)
    err = cudaMemcpy(out.data(), device_out, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy result from device to host (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < num_elements; i++) {
        assert(buf[i] + 1000 == out[i]);
    }
    printf("Tests passed!\n");
    cudaFree(device_buf);
    cudaFree(device_out);
    printf("Done\n");
    return 0;
}
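For inputs larger than one launch can cover, or to decouple grid size from data size, a common CUDA idiom is the grid-stride loop. A minimal sketch of how a.cu's kernel could be generalized (this variant is not part of the original gist):

// Hypothetical grid-stride variant of a.cu's kernel: each thread walks the
// buffer in steps of the total thread count, so any grid size covers any input.
__global__ void
vector_add_strided(const int16_t* in, int16_t* out, int num_elements)
{
    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
         i < num_elements;
         i += blockDim.x * gridDim.x) {
        out[i] = in[i] + 1000;
    }
}

// b.cu: trig distance from an origin point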
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <vector>
#include <gdal/gdal_priv.h>
#include <cuda_runtime.h>
// Kernel: distance from each cell to an origin point (ox, oy), scaled by the
// cell resolution `res` in metres. The elevation buffer `in` is currently unused.
// Indexing with the padded grid width (gridDim.x * blockDim.x) would skew rows
// whenever the raster width is not a multiple of the block width, so the real
// raster dimensions are passed in and used instead. std:: math functions are
// host functions and generally can't be called from device code, so sqrtf is
// used, with the redundant abs/pow folded into plain multiplication.
__global__ void
vector_add(const int16_t* in, float* out, int width, int height, int ox, int oy, int res)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < width && y < height) {
        float dx = ox - x;
        float dy = oy - y;
        out[y * width + x] = res * sqrtf(dx * dx + dy * dy);
    }
}
int
main()
{
    GDALAllRegister();
    GDALDataset *dataset = (GDALDataset*) GDALOpen("data/N33W099.hgt", GA_ReadOnly);
    if (dataset == nullptr) {
        fprintf(stderr, "Failed to open dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALRasterBand *band = dataset->GetRasterBand(1);
    int x_size = band->GetXSize();
    int y_size = band->GetYSize();
    int num_elements = x_size * y_size;
    size_t size = num_elements * sizeof(int16_t);
    std::vector<int16_t> buf(num_elements);
    std::vector<float> out(num_elements);
    auto gdal_err = band->RasterIO(GF_Read, 0, 0, x_size, y_size, buf.data(), x_size, y_size, GDT_Int16, 0, 0);
    if (gdal_err != CE_None) {
        fprintf(stderr, "Failed to read dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALClose((GDALDatasetH) dataset);
    // END GDAL CODE
    printf("Processing %d elements\n", num_elements);
    cudaError_t err = cudaSuccess;
    // allocate device input
    int16_t *device_buf = nullptr;
    err = cudaMalloc((void **)&device_buf, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // allocate device output buffer
    float *device_out = nullptr;
    err = cudaMalloc((void **)&device_out, num_elements * sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device output buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy input buf from host to device
    err = cudaMemcpy(device_buf, buf.data(), size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy buf from host to device (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    dim3 threads_per_block(16, 16);
    dim3 blocks_per_grid((x_size + threads_per_block.x - 1) / threads_per_block.x,
                         (y_size + threads_per_block.y - 1) / threads_per_block.y);
    auto start = std::chrono::high_resolution_clock::now();
    vector_add<<<blocks_per_grid, threads_per_block>>>(device_buf, device_out, x_size, y_size, x_size / 2, y_size / 2, 30);
    // kernel launches are asynchronous: without a synchronize, the timer would
    // only measure the launch overhead, not the kernel itself
    cudaDeviceSynchronize();
    auto dur = std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::high_resolution_clock::now() - start);
    printf("%fus\n", dur.count() * 1'000'000);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vector_add kernel (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // copy the device result from device memory to host memory;
    // `out` holds floats, so the byte count is num_elements * sizeof(float), not `size`
    err = cudaMemcpy(out.data(), device_out, num_elements * sizeof(float), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy result from device to host (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    cudaFree(device_buf);
    cudaFree(device_out);
    printf("Done\n");
    return 0;
}
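b.cu copies the result back but, unlike a.cu, never verifies it. A minimal host-side spot check, written as a fragment that would sit in main after the device-to-host copy (the sampling stride of 997 and the error tolerance are arbitrary choices, not from the original gist):

    // Hypothetical spot check: recompute a sparse sample of distances on the
    // CPU and compare against the kernel's output. ox, oy and res mirror the
    // launch arguments above.
    int ox = x_size / 2, oy = y_size / 2, res = 30;
    for (int y = 0; y < y_size; y += 997) {
        for (int x = 0; x < x_size; x += 997) {
            float dx = ox - x, dy = oy - y;
            float expected = res * std::sqrt(dx * dx + dy * dy);
            // allow a small relative error: host and device sqrt may differ by a few ulps
            assert(std::fabs(out[y * x_size + x] - expected) <= 1e-4f * expected + 1e-3f);
        }
    }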