Skip to content

Instantly share code, notes, and snippets.

@timshen91
Last active March 3, 2022 19:26
Show Gist options
  • Save timshen91/9ba7612f877839cdd346199e22efca6a to your computer and use it in GitHub Desktop.
Save timshen91/9ba7612f877839cdd346199e22efca6a to your computer and use it in GitHub Desktop.
Benchmarking Turing MMA instructions
#include <cuda.h>
#include <cuda_fp16.h>
// Warp-cooperative tensor-core multiply-accumulate D = A*B + C, shape
// m16n8k8, fp16 inputs with fp16 accumulation. Requires SM75+ (Turing).
// Per-thread fragment sizes (halves packed two-per-32-bit register):
//   A: 2 registers (row-major 16x8 tile), B: 1 register (col-major 8x8 tile),
//   C/D: 2 registers (16x8 accumulator tile).
// All 32 lanes of the warp must execute this together with their own
// fragment slices; D may alias C (callers pass the same array for both).
__device__ inline void mma_fp16_acc_fp16(unsigned const *A, unsigned const *B,
unsigned const *C, unsigned *D) {
// %0-%1 = D, %2-%3 = A, %4 = B, %5-%6 = C; volatile keeps the chain of
// dependent MMAs from being folded away by the compiler.
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, "
"{%2,%3}, {%4}, {%5,%6};\n"
: "=r"(D[0]), "=r"(D[1])
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1]));
}
// Warp-cooperative tensor-core multiply-accumulate D = A*B + C, shape
// m16n8k8, fp16 inputs with fp32 accumulation. Requires SM75+ (Turing).
// Per-thread fragment sizes: A: 2 registers (4 halves), B: 1 register
// (2 halves), C/D: 4 fp32 registers. All 32 lanes of the warp must execute
// this together; D may alias C (callers pass the same array for both).
__device__ inline void mma_fp16_acc_fp32(unsigned const *A, unsigned const *B,
float const *C, float *D) {
// %0-%3 = D, %4-%5 = A, %6 = B, %7-%10 = C; volatile keeps the chain of
// dependent MMAs from being folded away by the compiler.
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
"{%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]),
"f"(C[3]));
}
// Benchmark kernel: issues a long dependency chain of fp16-accumulate MMAs
// (each iteration feeds its output back in as the accumulator), so the loop
// measures MMA issue/latency throughput and cannot be collapsed.
// Launch: any 1D grid; blockDim.x should be a multiple of 32 (warp-wide MMA).
// Requires SM75+. The numeric result is meaningless; only timing matters.
__global__ void test_acc_fp16(void *out) {
  // Fully initialize every fragment register. The original code copied only
  // 4 bytes into A (leaving A[1] uninitialized) and 2 bytes into B (leaving
  // half of B[0] uninitialized) -- reads of those were undefined.
  __half a_vals[4] = {42., 42., 42., 42.};
  __half b_vals[2] = {42., 42.};
  unsigned A[2], B[1], C[2] = {0, 0};
  memcpy(A, a_vals, sizeof(A));
  memcpy(B, b_vals, sizeof(B));
  for (int i = 0; i < 204800; i++) {
    mma_fp16_acc_fp16(A, B, C, C);
  }
  // Publish the accumulator so the work is observable to the compiler.
  // Every thread writes the same location; the race is harmless here.
  memcpy(out, C, sizeof(C));  // was 4 bytes -- only half the accumulator
}
// Benchmark kernel: same fp16-accumulate MMA chain as test_acc_fp16, but the
// fp16 accumulator is periodically drained into fp32 running totals so the
// half-precision values do not saturate over the full run.
// Launch: any 1D grid; blockDim.x should be a multiple of 32 (warp-wide MMA).
// Requires SM75+. The numeric result is meaningless; only timing matters.
__global__ void test_acc_fp16_with_f32_output(void *out) {
  // Fully initialize every fragment register. The original code copied only
  // 4 bytes into A (leaving A[1] uninitialized) and 2 bytes into B (leaving
  // half of B[0] uninitialized) -- reads of those were undefined.
  __half a_vals[4] = {42., 42., 42., 42.};
  __half b_vals[2] = {42., 42.};
  unsigned A[2], B[1];
  float C[4] = {0, 0, 0, 0};
  memcpy(A, a_vals, sizeof(A));
  memcpy(B, b_vals, sizeof(B));
  for (int i = 0; i < 200; i++) {
    // Fresh fp16 accumulator per outer iteration; 1024 chained MMAs, then
    // the partial results are widened and added to the fp32 totals.
    __half tmp[4] = {0, 0, 0, 0};
    for (int j = 0; j < 1024; j++) {
      mma_fp16_acc_fp16(A, B, (unsigned *)tmp, (unsigned *)tmp);
    }
    C[0] += float(tmp[0]);
    C[1] += float(tmp[1]);
    C[2] += float(tmp[2]);
    C[3] += float(tmp[3]);
  }
  // Publish the totals so the work is observable to the compiler. Every
  // thread writes the same location; the race is harmless here.
  memcpy(out, C, sizeof(C));
}
// Benchmark kernel: long dependency chain of fp16-input, fp32-accumulate
// MMAs (each iteration feeds its output back in as the accumulator). On
// consumer Turing/Ampere parts fp32 accumulation runs at half the rate of
// fp16 accumulation, which is what this kernel exposes.
// Launch: any 1D grid; blockDim.x should be a multiple of 32 (warp-wide MMA).
// Requires SM75+. The numeric result is meaningless; only timing matters.
__global__ void test_acc_fp32(void *out) {
  // Fully initialize every fragment register. The original code copied only
  // 4 bytes into A (leaving A[1] uninitialized) and 2 bytes into B (leaving
  // half of B[0] uninitialized) -- reads of those were undefined.
  __half a_vals[4] = {42., 42., 42., 42.};
  __half b_vals[2] = {42., 42.};
  unsigned A[2], B[1];
  float C[4] = {0, 0, 0, 0};
  memcpy(A, a_vals, sizeof(A));
  memcpy(B, b_vals, sizeof(B));
  for (int i = 0; i < 204800; i++) {
    mma_fp16_acc_fp32(A, B, C, C);
  }
  // Publish the accumulator so the work is observable to the compiler.
  // Every thread writes the same location; the race is harmless here.
  memcpy(out, C, sizeof(C));
}
// Host driver: launches one benchmark kernel with a large grid and times the
// process externally (see the wall-time notes below). Swap in test_acc_fp16 /
// test_acc_fp32 manually to benchmark the other variants.
int main() {
  void *buf = nullptr;
  // Original code checked no CUDA return codes, so launch-config errors and
  // kernel faults were silently dropped; surface them as a nonzero exit.
  if (cudaMalloc(&buf, 128) != cudaSuccess) {
    return 1;
  }
  for (int i = 0; i < 1; i++) {
    test_acc_fp16_with_f32_output<<<401408, 128>>>(buf);
  }
  // Kernel launches do not return errors directly: cudaGetLastError() catches
  // bad launch configs, cudaDeviceSynchronize() catches async execution
  // faults (and is the point at which the benchmark time is fully spent).
  if (cudaGetLastError() != cudaSuccess) {
    return 1;
  }
  if (cudaDeviceSynchronize() != cudaSuccess) {
    return 1;
  }
  cudaFree(buf);
  return 0;
}
// Wall times below are single-run measurements; run-to-run jitter was not measured but is within roughly 0.5 s.
// Machine 1, TITAN RTX benchmark results (process wall time):
// test_acc_fp16: 5.604 seconds
// test_acc_fp16_with_f32_output: 5.546 seconds
// test_acc_fp32: 5.536 seconds
// Machine 2, RTX 2080 Ti benchmark results (process wall time):
// test_acc_fp16: 5.283 seconds
// test_acc_fp16_with_f32_output: 5.252 seconds
// test_acc_fp32: 10.288 seconds
// Machine 2, RTX 3090 benchmark results (process wall time):
// test_acc_fp16: 4.346 seconds
// test_acc_fp16_with_f32_output: 4.360 seconds
// test_acc_fp32: 8.510 seconds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment