jlebar/test.cu

## test.cu
#include <stdio.h>
#include <vector>

// Mask naming all 32 lanes of a warp, passed to __syncwarp() below.
// __syncwarp() requires every calling thread to have its own bit set in the
// mask; a narrower value such as 31 (0x1f) names only lanes 0-4, so any other
// lane reaching the barrier would invoke undefined behavior.
static constexpr unsigned FULL_MASK = 0xffffffffu;

// Records the warp's active mask on both sides of a __syncwarp() that sits
// inside a divergent if/else.
//
// Each thread chooses a branch from in[threadIdx.x] and writes two values:
//   out[2 * threadIdx.x]     - __activemask() sampled before the barrier
//   out[2 * threadIdx.x + 1] - __activemask() sampled after the barrier
// Threads that take the else branch (non-zero input) store the negated mask,
// so the branch taken is visible in the output.
//
// Indexing uses threadIdx.x only, so `in` must hold at least blockDim.x ints
// and `out` at least 2 * blockDim.x ints; there is no bounds check.
// NOTE(review): both branches call __syncwarp(FULL_MASK) from diverged code;
// the CUDA guide only permits that on sm_70 and newer - confirm the target.
__global__ void test(const int* in, int *out) {
  printf("threadIdx.x=%d, blockIdx.x=%d\n", threadIdx.x, blockIdx.x);
  if (in[threadIdx.x] == 0) {
    // Zero input: store the raw masks.
    out[2 * threadIdx.x] = __activemask();
    __syncwarp(FULL_MASK);
    out[2 * threadIdx.x + 1] = __activemask();
  } else {
    // Non-zero input: store the negated masks (unsigned negation, then
    // converted to int on assignment).
    out[2 * threadIdx.x] = -__activemask();
    __syncwarp(FULL_MASK);
    out[2 * threadIdx.x + 1] = -__activemask();
  }
}

// Evaluates the CUDA runtime call `x` exactly once and, if it did not return
// cudaSuccess, prints the source line, the numeric error code and its
// description to stdout.  Execution continues after a failure; the macro only
// reports.  The do/while(0) wrapper makes the expansion a single statement, so
// `if (c) CHECK(f()); else ...` keeps its `else` attached to the outer `if`.
#define CHECK(x)                                          \
  do {                                                    \
    const cudaError_t check_err_ = (x);                   \
    if (check_err_ != cudaSuccess) {                      \
      printf("FAILED at line %d: %d (%s)\n", __LINE__,    \
             static_cast<int>(check_err_),                \
             cudaGetErrorString(check_err_));             \
    }                                                     \
  } while (0)

// Host driver: uploads an alternating 0/1 input pattern, runs the `test`
// kernel on a single block of 32 threads, and prints every value the kernel
// wrote.  Failures of CUDA calls are reported through CHECK.
int main() {
  constexpr int kNumThreads = 32;               // one block of 32 threads
  constexpr int kNumOutputs = 2 * kNumThreads;  // kernel writes two ints per thread

  // Initialised to nullptr so a failed cudaMalloc does not leave garbage behind.
  int* in = nullptr;
  CHECK(cudaMalloc(&in, kNumThreads * sizeof(int)));

  // Even threads get 0, odd threads get 1, so neighbouring lanes take
  // different branches in the kernel.
  std::vector<int> host_in(kNumThreads);
  for (int i = 0; i < kNumThreads; i++) {
    host_in[i] = i % 2;
  }
  CHECK(cudaMemcpy(in, host_in.data(), kNumThreads * sizeof(int),
                   cudaMemcpyHostToDevice));

  int* out = nullptr;
  CHECK(cudaMalloc(&out, kNumOutputs * sizeof(int)));
  CHECK(cudaDeviceSynchronize());

  printf("Starting kernel...\n");
  test<<<1, kNumThreads>>>(in, out);
  CHECK(cudaGetLastError());       // launch-configuration errors
  CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran
  printf("Kernel finished.\n");

  std::vector<int> host_out(kNumOutputs);
  CHECK(cudaMemcpy(host_out.data(), out, kNumOutputs * sizeof(int),
                   cudaMemcpyDeviceToHost));
  printf("Kernel output:\n");
  for (int i = 0; i < kNumOutputs; i++) {
    printf("out[%d] = %d\n", i, host_out[i]);
  }

  // Release the device buffers; cudaFree(nullptr) is a no-op.
  CHECK(cudaFree(out));
  CHECK(cudaFree(in));
  return 0;
}
	#include <stdio.h>
	#include <vector>

	// Mask naming all 32 lanes of a warp, passed to __syncwarp() below.
	// __syncwarp() requires every calling thread to have its own bit set in the
	// mask; a narrower value such as 31 (0x1f) names only lanes 0-4, so any other
	// lane reaching the barrier would invoke undefined behavior.
	static constexpr unsigned FULL_MASK = 0xffffffffu;

	// Records the warp's active mask on both sides of a __syncwarp() that sits
	// inside a divergent if/else.
	//
	// Each thread chooses a branch from in[threadIdx.x] and writes two values:
	//   out[2 * threadIdx.x]     - __activemask() sampled before the barrier
	//   out[2 * threadIdx.x + 1] - __activemask() sampled after the barrier
	// Threads that take the else branch (non-zero input) store the negated mask,
	// so the branch taken is visible in the output.
	//
	// Indexing uses threadIdx.x only, so `in` must hold at least blockDim.x ints
	// and `out` at least 2 * blockDim.x ints; there is no bounds check.
	// NOTE(review): both branches call __syncwarp(FULL_MASK) from diverged code;
	// the CUDA guide only permits that on sm_70 and newer - confirm the target.
	__global__ void test(const int* in, int *out) {
	  printf("threadIdx.x=%d, blockIdx.x=%d\n", threadIdx.x, blockIdx.x);
	  if (in[threadIdx.x] == 0) {
	    // Zero input: store the raw masks.
	    out[2 * threadIdx.x] = __activemask();
	    __syncwarp(FULL_MASK);
	    out[2 * threadIdx.x + 1] = __activemask();
	  } else {
	    // Non-zero input: store the negated masks (unsigned negation, then
	    // converted to int on assignment).
	    out[2 * threadIdx.x] = -__activemask();
	    __syncwarp(FULL_MASK);
	    out[2 * threadIdx.x + 1] = -__activemask();
	  }
	}

	// Evaluates the CUDA runtime call `x` exactly once and, if it did not return
	// cudaSuccess, prints the source line, the numeric error code and its
	// description to stdout.  Execution continues after a failure; the macro only
	// reports.  The do/while(0) wrapper makes the expansion a single statement, so
	// `if (c) CHECK(f()); else ...` keeps its `else` attached to the outer `if`.
	#define CHECK(x)                                          \
	  do {                                                    \
	    const cudaError_t check_err_ = (x);                   \
	    if (check_err_ != cudaSuccess) {                      \
	      printf("FAILED at line %d: %d (%s)\n", __LINE__,    \
	             static_cast<int>(check_err_),                \
	             cudaGetErrorString(check_err_));             \
	    }                                                     \
	  } while (0)

	// Host driver: uploads an alternating 0/1 input pattern, runs the `test`
	// kernel on a single block of 32 threads, and prints every value the kernel
	// wrote.  Failures of CUDA calls are reported through CHECK.
	int main() {
	  constexpr int kNumThreads = 32;               // one block of 32 threads
	  constexpr int kNumOutputs = 2 * kNumThreads;  // kernel writes two ints per thread

	  // Initialised to nullptr so a failed cudaMalloc does not leave garbage behind.
	  int* in = nullptr;
	  CHECK(cudaMalloc(&in, kNumThreads * sizeof(int)));

	  // Even threads get 0, odd threads get 1, so neighbouring lanes take
	  // different branches in the kernel.
	  std::vector<int> host_in(kNumThreads);
	  for (int i = 0; i < kNumThreads; i++) {
	    host_in[i] = i % 2;
	  }
	  CHECK(cudaMemcpy(in, host_in.data(), kNumThreads * sizeof(int),
	                   cudaMemcpyHostToDevice));

	  int* out = nullptr;
	  CHECK(cudaMalloc(&out, kNumOutputs * sizeof(int)));
	  CHECK(cudaDeviceSynchronize());

	  printf("Starting kernel...\n");
	  test<<<1, kNumThreads>>>(in, out);
	  CHECK(cudaGetLastError());       // launch-configuration errors
	  CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran
	  printf("Kernel finished.\n");

	  std::vector<int> host_out(kNumOutputs);
	  CHECK(cudaMemcpy(host_out.data(), out, kNumOutputs * sizeof(int),
	                   cudaMemcpyDeviceToHost));
	  printf("Kernel output:\n");
	  for (int i = 0; i < kNumOutputs; i++) {
	    printf("out[%d] = %d\n", i, host_out[i]);
	  }

	  // Release the device buffers; cudaFree(nullptr) is a no-op.
	  CHECK(cudaFree(out));
	  CHECK(cudaFree(in));
	  return 0;
	}