Last active
March 3, 2022 19:26
-
-
Save timshen91/9ba7612f877839cdd346199e22efca6a to your computer and use it in GitHub Desktop.
Benchmarking Turing MMA instructions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <cuda_fp16.h> | |
__device__ inline void mma_fp16_acc_fp16(unsigned const *A, unsigned const *B, | |
unsigned const *C, unsigned *D) { | |
asm volatile( | |
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, " | |
"{%2,%3}, {%4}, {%5,%6};\n" | |
: "=r"(D[0]), "=r"(D[1]) | |
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1])); | |
} | |
__device__ inline void mma_fp16_acc_fp32(unsigned const *A, unsigned const *B, | |
float const *C, float *D) { | |
asm volatile( | |
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " | |
"{%4,%5}, {%6}, {%7,%8,%9,%10};\n" | |
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) | |
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), | |
"f"(C[3])); | |
} | |
__global__ void test_acc_fp16(void *out) { | |
unsigned A[2], B[1], C[2] = {0, 0}; | |
__half values[2] = {42., 42.}; | |
memcpy(A, values, 4); | |
memcpy(B, values, 2); | |
for (int i = 0; i < 204800; i++) { | |
mma_fp16_acc_fp16(A, B, C, C); | |
} | |
memcpy(out, C, 4); | |
} | |
__global__ void test_acc_fp16_with_f32_output(void *out) { | |
unsigned A[2], B[1]; | |
float C[4] = {0, 0, 0, 0}; | |
__half values[2] = {42., 42.}; | |
memcpy(A, values, 4); | |
memcpy(B, values, 2); | |
for (int i = 0; i < 200; i++) { | |
__half tmp[4] = {0, 0, 0, 0}; | |
for (int j = 0; j < 1024; j++) { | |
mma_fp16_acc_fp16(A, B, (unsigned *)tmp, (unsigned *)tmp); | |
} | |
C[0] += float(tmp[0]); | |
C[1] += float(tmp[1]); | |
C[2] += float(tmp[2]); | |
C[3] += float(tmp[3]); | |
} | |
memcpy(out, C, 4 * 4); | |
} | |
__global__ void test_acc_fp32(void *out) { | |
unsigned A[2], B[1]; | |
float C[4] = {0, 0, 0, 0}; | |
__half values[2] = {42., 42.}; | |
memcpy(A, values, 4); | |
memcpy(B, values, 2); | |
for (int i = 0; i < 204800; i++) { | |
mma_fp16_acc_fp32(A, B, C, C); | |
} | |
memcpy(out, C, 4 * 4); | |
} | |
int main() { | |
void *buf; | |
cudaMalloc(&buf, 128); | |
for (int i = 0; i < 1; i++) { | |
test_acc_fp16_with_f32_output<<<401408, 128>>>(buf); | |
} | |
cudaDeviceSynchronize(); | |
} | |
// Wall times are one-shot, with unmeasured jitters (but within the < .5s ballpark). | |
// Machine 1, TITAN RTX benchmark results (process wall time): | |
// test_acc_fp16: 5.604 seconds | |
// test_acc_fp16_with_f32_output: 5.546 seconds | |
// test_acc_fp32: 5.536 seconds | |
// Machine 2, RTX 2080 Ti benchmark results (process wall time): | |
// test_acc_fp16: 5.283 seconds | |
// test_acc_fp16_with_f32_output: 5.252 seconds | |
// test_acc_fp32: 10.288 seconds | |
// Machine 2, RTX 3090 benchmark results (process wall time): | |
// test_acc_fp16: 4.346 seconds | |
// test_acc_fp16_with_f32_output: 4.360 seconds | |
// test_acc_fp32: 8.510 seconds |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment