Skip to content

Instantly share code, notes, and snippets.

@jlebar
Created October 29, 2020 06:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jlebar/50d1b5fedc926c879a64436229c1cc05 to your computer and use it in GitHub Desktop.
Save jlebar/50d1b5fedc926c879a64436229c1cc05 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <vector>
// Participant mask handed to __syncwarp(): bit i names warp lane i.
// All 32 bits are set so that every lane of a full warp is a named participant.
// __syncwarp() requires each calling lane to have its own bit set in the mask;
// a value such as 31 (0b11111) would name only lanes 0-4 and leave the call
// undefined for lanes 5-31. The type is unsigned because the mask is a 32-bit
// bit pattern (and is what __syncwarp() takes), not a signed quantity.
static constexpr unsigned FULL_MASK = 0xffffffffu;
// Probe kernel: records the warp's active-lane mask on each side of a
// __syncwarp() that is issued from inside a divergent if/else.
//
// in  - device array read at index threadIdx.x; a zero selects the first
//       branch, any other value selects the second.
// out - device array; each thread writes out[2 * threadIdx.x] (mask sampled
//       before the barrier) and out[2 * threadIdx.x + 1] (mask sampled after).
//
// Addressing uses threadIdx.x only (blockIdx.x is printed but never used as an
// index) and there is no bounds check, so `in` must hold at least blockDim.x
// ints and `out` at least 2 * blockDim.x ints.
__global__ void test(const int* in, int *out) {
// Debug trace: one line per thread.
printf("threadIdx.x=%d, blockIdx.x=%d\n", threadIdx.x, blockIdx.x);
if (in[threadIdx.x] == 0) {
// __activemask() yields an unsigned lane bitmask; it is implicitly converted
// to int when stored.
out[2 * threadIdx.x] = __activemask();
// NOTE(review): __syncwarp() requires every calling lane's own bit to be set
// in the mask, and all non-exited lanes named in the mask must reach a
// __syncwarp() with the same mask; reaching it from different branches is only
// supported on sm_70+ -- confirm FULL_MASK and the build target satisfy this.
__syncwarp(FULL_MASK);
out[2 * threadIdx.x + 1] = __activemask();
} else {
// Same two samples as the branch above, but stored negated (unsigned negation,
// then conversion to int) -- presumably so entries written by this branch can
// be told apart in the dump; verify against how the output is read.
out[2 * threadIdx.x] = -__activemask();
__syncwarp(FULL_MASK);
out[2 * threadIdx.x + 1] = -__activemask();
}
}
// CHECK(call): evaluates a CUDA runtime call exactly once and, if it did not
// return cudaSuccess, prints the source line, the numeric error code and the
// runtime's description of that code to stdout. Execution continues after a
// failure (best-effort reporting, no abort).
//
// The body is wrapped in do { ... } while (0) so the macro expands to a single
// statement: it is safe inside an unbraced if/else and does not need the C++17
// if-with-initializer syntax. The trailing-underscore local avoids clashing
// with names in the caller's scope.
#define CHECK(x)                                                          \
    do {                                                                  \
        const cudaError_t check_err_ = (x);                               \
        if (check_err_ != cudaSuccess) {                                  \
            printf("FAILED at line %d: %d (%s)\n", __LINE__,              \
                   static_cast<int>(check_err_),                          \
                   cudaGetErrorString(check_err_));                       \
        }                                                                 \
    } while (0)
// Host driver for the `test` kernel.
//
// Uploads a 32-element input whose values alternate 0,1,0,1,... so that
// neighbouring threads take different branches in the kernel, launches one
// block of 32 threads, then copies back and prints the 64 ints the kernel
// wrote (two per thread). Every CUDA call is routed through CHECK, which
// reports failures and lets the program continue.
int main() {
    constexpr int kThreads = 32;                 // threads launched (single block)
    constexpr int kOutCount = 2 * kThreads;      // kernel writes two ints per thread

    // Initialised to nullptr so a failed cudaMalloc leaves a well-defined
    // pointer for the subsequent (failing, reported) calls.
    int* in = nullptr;
    CHECK(cudaMalloc(&in, kThreads * sizeof(int)));

    std::vector<int> host_in(kThreads);
    for (int i = 0; i < kThreads; i++) {
        host_in[i] = i % 2;
    }
    CHECK(cudaMemcpy(in, host_in.data(), kThreads * sizeof(int),
                     cudaMemcpyHostToDevice));

    int* out = nullptr;
    CHECK(cudaMalloc(&out, kOutCount * sizeof(int)));
    CHECK(cudaDeviceSynchronize());

    printf("Starting kernel...\n");
    test<<<1, kThreads>>>(in, out);
    // A launch does not return a status itself: launch-configuration errors
    // are picked up here, faults during execution at the synchronize below.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    printf("Kernel finished.\n");

    // Value-initialised, so zeros are printed if the copy-back fails.
    std::vector<int> host_out(kOutCount);
    CHECK(cudaMemcpy(host_out.data(), out, kOutCount * sizeof(int),
                     cudaMemcpyDeviceToHost));

    printf("Kernel output:\n");
    for (int i = 0; i < kOutCount; i++) {
        printf("out[%d] = %d\n", i, host_out[i]);
    }

    // Release the device buffers.
    CHECK(cudaFree(out));
    CHECK(cudaFree(in));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment