This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda/runtime_api.hpp> | |
// Global float placed in device memory by the __device__ qualifier.
// NOTE(review): presumably a sink the kernel stores its accumulator into so the
// arithmetic is not optimized away; the kernel body is truncated here, so confirm.
__device__ float pirate_value; | |
__global__ void pirate_float_non_assembler(char operation, int num_operations) { | |
float acc{}; | |
switch (operation) { | |
case 'a': | |
for (int i = 0; i < num_operations; ++i) { | |
acc += threadIdx.x; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
// compile with: nvcc -g -G -std=c++17 test-compute-sanitizer-racecheck-with-sync-intrinsic.cu -arch sm_75 -o test-compute-sanitizer-racecheck-with-sync-intrinsic
// execute with: compute-sanitizer --tool racecheck ./test-compute-sanitizer-racecheck-with-sync-intrinsic
//
// The warp broadcast example at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-examples-broadcast
// states, in the comment that follows its call to `__shfl_sync`, that the threads are "synchronized".
//
// Single-element float array declared __managed__, i.e. placed in CUDA managed
// memory that both host and device code can address.
// NOTE(review): presumably the output slot the test kernel writes; the kernel is
// not visible in this excerpt, so confirm.
__managed__ float result[1]; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import numpy as np | |
import matplotlib.pyplot as plt | |
# Left and right endpoints of the interval [x0, x1].
x0 = -10.0 | |
x1 = 10.0 | |
# Number of equal-width subdivisions of the interval.
N = 1000 | |
# Width of one subdivision: the interval length divided by N.
# NOTE(review): presumably the grid spacing for the data plotted later; confirm.
dx = (x1 - x0) / N |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// compile with `nvcc -std=c++17 -arch sm_61 --expt-relaxed-constexpr main.cu -o main` | |
#include <cfloat> | |
#include <cstdio> | |
#include <iostream> | |
__global__ void kernel(float* values, int num_values) { | |
const int tid = threadIdx.x; | |
const int G = blockDim.x; | |
float r = 0; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Require CMake 3.18 or newer.
cmake_minimum_required(VERSION 3.18) | |
# Declare the project and enable the C++ and CUDA languages.
project(cuda-clang LANGUAGES CXX CUDA) | |
# Locate external packages; REQUIRED makes configuration fail if one is missing.
find_package(CMakeshift REQUIRED) | |
# The gsl-lite lookup below is currently commented out.
#find_package(gsl-lite REQUIRED) | |
find_package(cuda-api-wrappers REQUIRED) | |
# Build the cuda-clang executable from the single source file cuda-clang.cu.
add_executable(cuda-clang "cuda-clang.cu") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// compile with `nvcc -std=c++17 -rdc=true --extended-lambda dynamic-parallelism-with-thrust-and-multiple-streams.cu -o dpe` | |
#include <iostream> | |
#include <thrust/device_vector.h> | |
#include <thrust/execution_policy.h> | |
#include <thrust/for_each.h> | |
#include <thrust/iterator/counting_iterator.h> | |
struct do_something { | |
__device__ do_something(float* ptr) : ptr_(ptr) {} |