Skip to content

Instantly share code, notes, and snippets.

@timshen91
Last active March 3, 2022 19:26
Show Gist options
  • Save timshen91/9ba7612f877839cdd346199e22efca6a to your computer and use it in GitHub Desktop.
Save timshen91/9ba7612f877839cdd346199e22efca6a to your computer and use it in GitHub Desktop.
Benchmarking Turing MMA instructions
#include <cuda.h>
#include <cuda_fp16.h>
// Warp-cooperative tensor-core multiply-accumulate D = A*B + C, shape
// m16n8k8, fp16 inputs with fp16 accumulation. Requires SM75+ (Turing).
// Per-thread fragment sizes (halves packed two-per-32-bit register):
//   A: 2 registers (row-major 16x8 tile), B: 1 register (col-major 8x8 tile),
//   C/D: 2 registers (16x8 accumulator tile).
// All 32 lanes of the warp must execute this together with their own
// fragment slices; D may alias C (callers pass the same array for both).
__device__ inline void mma_fp16_acc_fp16(unsigned const *A, unsigned const *B,
unsigned const *C, unsigned *D) {
// %0-%1 = D, %2-%3 = A, %4 = B, %5-%6 = C; volatile keeps the chain of
// dependent MMAs from being folded away by the compiler.
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, "
"{%2,%3}, {%4}, {%5,%6};\n"
: "=r"(D[0]), "=r"(D[1])
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1]));
}
// Warp-cooperative tensor-core multiply-accumulate D = A*B + C, shape
// m16n8k8, fp16 inputs with fp32 accumulation. Requires SM75+ (Turing).
// Per-thread fragment sizes: A: 2 registers (4 halves), B: 1 register
// (2 halves), C/D: 4 fp32 registers. All 32 lanes of the warp must execute
// this together; D may alias C (callers pass the same array for both).
__device__ inline void mma_fp16_acc_fp32(unsigned const *A, unsigned const *B,
float const *C, float *D) {
// %0-%3 = D, %4-%5 = A, %6 = B, %7-%10 = C; volatile keeps the chain of
// dependent MMAs from being folded away by the compiler.
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
"{%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]),
"f"(C[3]));
}
// Benchmark kernel: issues a long dependency chain of fp16-accumulate MMAs
// (each iteration feeds its output back in as the accumulator), so the loop
// measures MMA issue/latency throughput and cannot be collapsed.
// Launch: any 1D grid; blockDim.x should be a multiple of 32 (warp-wide MMA).
// Requires SM75+. The numeric result is meaningless; only timing matters.
__global__ void test_acc_fp16(void *out) {
  // Fully initialize every fragment register. The original code copied only
  // 4 bytes into A (leaving A[1] uninitialized) and 2 bytes into B (leaving
  // half of B[0] uninitialized) -- reads of those were undefined.
  __half a_vals[4] = {42., 42., 42., 42.};
  __half b_vals[2] = {42., 42.};
  unsigned A[2], B[1], C[2] = {0, 0};
  memcpy(A, a_vals, sizeof(A));
  memcpy(B, b_vals, sizeof(B));
  for (int i = 0; i < 204800; i++) {
    mma_fp16_acc_fp16(A, B, C, C);
  }
  // Publish the accumulator so the work is observable to the compiler.
  // Every thread writes the same location; the race is harmless here.
  memcpy(out, C, sizeof(C));  // was 4 bytes -- only half the accumulator
}
// Benchmark kernel: same fp16-accumulate MMA chain as test_acc_fp16, but the
// fp16 accumulator is periodically drained into fp32 running totals so the
// half-precision values do not saturate over the full run.
// Launch: any 1D grid; blockDim.x should be a multiple of 32 (warp-wide MMA).
// Requires SM75+. The numeric result is meaningless; only timing matters.
__global__ void test_acc_fp16_with_f32_output(void *out) {
  // Fully initialize every fragment register. The original code copied only
  // 4 bytes into A (leaving A[1] uninitialized) and 2 bytes into B (leaving
  // half of B[0] uninitialized) -- reads of those were undefined.
  __half a_vals[4] = {42., 42., 42., 42.};
  __half b_vals[2] = {42., 42.};
  unsigned A[2], B[1];
  float C[4] = {0, 0, 0, 0};
  memcpy(A, a_vals, sizeof(A));
  memcpy(B, b_vals, sizeof(B));
  for (int i = 0; i < 200; i++) {
    // Fresh fp16 accumulator per outer iteration; 1024 chained MMAs, then
    // the partial results are widened and added to the fp32 totals.
    __half tmp[4] = {0, 0, 0, 0};
    for (int j = 0; j < 1024; j++) {
      mma_fp16_acc_fp16(A, B, (unsigned *)tmp, (unsigned *)tmp);
    }
    C[0] += float(tmp[0]);
    C[1] += float(tmp[1]);
    C[2] += float(tmp[2]);
    C[3] += float(tmp[3]);
  }
  // Publish the totals so the work is observable to the compiler. Every
  // thread writes the same location; the race is harmless here.
  memcpy(out, C, sizeof(C));
}
// Benchmark kernel: long dependency chain of fp16-input, fp32-accumulate
// MMAs (each iteration feeds its output back in as the accumulator). On
// consumer Turing/Ampere parts fp32 accumulation runs at half the rate of
// fp16 accumulation, which is what this kernel exposes.
// Launch: any 1D grid; blockDim.x should be a multiple of 32 (warp-wide MMA).
// Requires SM75+. The numeric result is meaningless; only timing matters.
__global__ void test_acc_fp32(void *out) {
  // Fully initialize every fragment register. The original code copied only
  // 4 bytes into A (leaving A[1] uninitialized) and 2 bytes into B (leaving
  // half of B[0] uninitialized) -- reads of those were undefined.
  __half a_vals[4] = {42., 42., 42., 42.};
  __half b_vals[2] = {42., 42.};
  unsigned A[2], B[1];
  float C[4] = {0, 0, 0, 0};
  memcpy(A, a_vals, sizeof(A));
  memcpy(B, b_vals, sizeof(B));
  for (int i = 0; i < 204800; i++) {
    mma_fp16_acc_fp32(A, B, C, C);
  }
  // Publish the accumulator so the work is observable to the compiler.
  // Every thread writes the same location; the race is harmless here.
  memcpy(out, C, sizeof(C));
}
// Host driver: launches one benchmark kernel with a large grid and times the
// process externally (see the wall-time notes below). Swap in test_acc_fp16 /
// test_acc_fp32 manually to benchmark the other variants.
int main() {
  void *buf = nullptr;
  // Original code checked no CUDA return codes, so launch-config errors and
  // kernel faults were silently dropped; surface them as a nonzero exit.
  if (cudaMalloc(&buf, 128) != cudaSuccess) {
    return 1;
  }
  for (int i = 0; i < 1; i++) {
    test_acc_fp16_with_f32_output<<<401408, 128>>>(buf);
  }
  // Kernel launches do not return errors directly: cudaGetLastError() catches
  // bad launch configs, cudaDeviceSynchronize() catches async execution
  // faults (and is the point at which the benchmark time is fully spent).
  if (cudaGetLastError() != cudaSuccess) {
    return 1;
  }
  if (cudaDeviceSynchronize() != cudaSuccess) {
    return 1;
  }
  cudaFree(buf);
  return 0;
}
// Wall times below are single-run measurements; run-to-run jitter was not measured but is within roughly 0.5 s.
// Machine 1, TITAN RTX benchmark results (process wall time):
// test_acc_fp16: 5.604 seconds
// test_acc_fp16_with_f32_output: 5.546 seconds
// test_acc_fp32: 5.536 seconds
// Machine 2, RTX 2080 Ti benchmark results (process wall time):
// test_acc_fp16: 5.283 seconds
// test_acc_fp16_with_f32_output: 5.252 seconds
// test_acc_fp32: 10.288 seconds
// Machine 2, RTX 3090 benchmark results (process wall time):
// test_acc_fp16: 4.346 seconds
// test_acc_fp16_with_f32_output: 4.360 seconds
// test_acc_fp32: 8.510 seconds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment