Skip to content

Instantly share code, notes, and snippets.

@mratsim
Created December 14, 2022 10:04
Show Gist options
  • Save mratsim/0e1f1454d549e81312a5fb97ac717b7a to your computer and use it in GitHub Desktop.
Uint256 on Nvidia, codegen quality investigation
; ModuleID = 'build/nvidia/add_carry.cu'
source_filename = "build/nvidia/add_carry.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%printf_args = type { i64 }
%printf_args.0 = type { i64 }
@.str = private unnamed_addr constant [27 x i8] c"32-bit Addition: %#016llx\0A\00", align 1
@.str1 = private unnamed_addr constant [27 x i8] c"64-bit Addition: %#016llx\0A\00", align 1
; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
; Device kernel addcKernel() at -O0 (optnone, see attrs #0): every local gets
; a stack slot. %1/%2 hold result32bitAdd/result64bitAdd; %3/%4 are the
; single-i64 argument packs passed to vprintf for the two printf calls.
define dso_local void @_Z13addcKernelv() #0 {
%1 = alloca i64, align 8
%2 = alloca i64, align 8
%3 = alloca %printf_args, align 8
%4 = alloca %printf_args.0, align 8
store i64 0, i64* %1, align 8
store i64 0, i64* %2, align 8
; 32-bit carry-chain version: add.cc sets the carry flag, addc.cc consumes it,
; mov.b64 packs the two 32-bit halves into the "=l" (64-bit) output.
%5 = call i64 asm ".reg .b32 r0;\0A\09.reg .b32 r1;\0A\09add.cc.u32 r0, 0xc2775652, 0x4c60baa8;\0A\09addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6;\0A\09mov.b64 $0, {r0, r1}\0A\09;", "=l"() #1, !srcloc !7
store i64 %5, i64* %1, align 8
; Native 64-bit addition of the same operands.
%6 = call i64 asm "add.u64 $0, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8;\0A\09", "=l"() #1, !srcloc !8
store i64 %6, i64* %2, align 8
; printf("32-bit Addition: %#016llx\n", result32bitAdd) via vprintf.
%7 = load i64, i64* %1, align 8
%8 = getelementptr inbounds %printf_args, %printf_args* %3, i32 0, i32 0
store i64 %7, i64* %8, align 8
%9 = bitcast %printf_args* %3 to i8*
%10 = call i32 @vprintf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str, i64 0, i64 0), i8* %9)
; printf("64-bit Addition: %#016llx\n", result64bitAdd) via vprintf.
%11 = load i64, i64* %2, align 8
%12 = getelementptr inbounds %printf_args.0, %printf_args.0* %4, i32 0, i32 0
store i64 %11, i64* %12, align 8
%13 = bitcast %printf_args.0* %4 to i8*
%14 = call i32 @vprintf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str1, i64 0, i64 0), i8* %13)
ret void
}
declare i32 @vprintf(i8*, i8*)
attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_86" "target-features"="+ptx75,+sm_86" }
attributes #1 = { convergent nounwind readnone }
!llvm.module.flags = !{!0, !1, !2, !3}
!nvvm.annotations = !{!4}
!llvm.ident = !{!5, !6}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{i32 7, !"frame-pointer", i32 2}
!4 = !{void ()* @_Z13addcKernelv, !"kernel", i32 1}
!5 = !{!"clang version 14.0.6"}
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!7 = !{i64 229, i64 245, i64 267, i64 314, i64 362, i64 391}
!8 = !{i64 430, i64 484}
// Compile with
// NVCC:
// nvcc -arch=sm_86 -ptx build/nvidia/add_carry.cu -o build/nvidia/add_carry_nvcc.ptx
// Clang/LLVM with NVPTX backend
// clang++ -S -emit-llvm \
// build/nvidia/add_carry.cu \
// --cuda-gpu-arch=sm_86 \
// -L/opt/cuda/lib64 \
// -lcudart_static -ldl -lrt -pthread
// clang++ build/nvidia/add_carry.cu \
// -o build/nvidia/add_carry \
// --cuda-gpu-arch=sm_86 \
// -L/opt/cuda/lib64 \
// -lcudart_static -ldl -lrt -pthread
// llc -mcpu=sm_86 build/nvidia/add_carry-cuda-nvptx64-nvidia-cuda-sm_86.ll -o build/nvidia/add_carry_llvm.ptx
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdint>
#include <stdio.h>
cudaError_t addc();
// Kernel: computes 0xa64ab78dc2775652 + 0xb1da3ab64c60baa8 two ways — as two
// chained 32-bit additions and as one native 64-bit addition — and prints
// both. The sum overflows 64 bits, so both paths print the same wrapped
// value (0x5824f2440ed810fa), demonstrating the carry-chain equivalence.
__global__ void addcKernel()
{
uint64_t result32bitAdd = 0;
uint64_t result64bitAdd = 0;
// 32-bit carry-chain version: r0/r1 are scratch registers declared inside
// the asm block; add.cc sets the carry flag, addc.cc consumes it, and
// mov.b64 packs the low/high halves into the "=l" (64-bit) output %0.
asm(".reg .b32 r0;\n\t"
".reg .b32 r1;\n\t"
"add.cc.u32 r0, 0xc2775652, 0x4c60baa8;\n\t"
"addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6;\n\t"
"mov.b64 %0, {r0, r1}\n\t;"
: "=l"(result32bitAdd));
// Single 64-bit addition of the same operands, for comparison.
asm("add.u64 %0, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8;\n\t"
: "=l"(result64bitAdd));
printf("32-bit Addition: %#016llx\n", result32bitAdd);
printf("64-bit Addition: %#016llx\n", result64bitAdd);
}
// Host entry point: run the carry-addition demo, reset the device, then wait
// for a key press so the console output stays visible.
int main()
{
if (addc() != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
if (cudaDeviceReset() != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
getchar();
return 0;
}
// Launches addcKernel on device 0 with a single thread and waits for it to
// finish. Returns the first CUDA error encountered, cudaSuccess otherwise.
cudaError_t addc()
{
cudaError_t cudaStatus;
// Fix: the result of cudaSetDevice was previously discarded (immediately
// overwritten by cudaGetLastError), so a missing device or driver mismatch
// went unreported until a later, confusing failure.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
addcKernel <<<1, 1>>>();
// Kernel launches are asynchronous and return no error directly;
// launch-configuration errors surface through cudaGetLastError.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addcKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Execution errors inside the kernel only surface at a synchronizing call.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addcKernel!\n", cudaStatus);
goto Error;
}
Error:
return cudaStatus;
}
//
// Generated by LLVM NVPTX Back-End
//
.version 7.1
.target sm_86
.address_size 64
// .globl _Z13addcKernelv // -- Begin function _Z13addcKernelv
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.global .align 1 .b8 _$_str[27] = {51, 50, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0};
.global .align 1 .b8 _$_str1[27] = {54, 52, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0};
// @_Z13addcKernelv
// addcKernel lowered by the LLVM NVPTX backend at -O0: a 32-byte local
// stack frame (__local_depot0) holds both results plus the two vprintf
// argument slots; %SP is the generic-space alias of the local pointer %SPL.
.visible .entry _Z13addcKernelv()
{
.local .align 8 .b8 __local_depot0[32];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<5>;
.reg .b64 %rd<12>;
// %bb.0:
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
// Zero-initialize both result slots (result32bitAdd at +0, result64bitAdd
// at +8) — dead stores kept because the module was compiled optnone.
mov.u64 %rd3, 0;
st.u64 [%SP+0], %rd3;
st.u64 [%SP+8], %rd3;
// begin inline asm
.reg .b32 r0;
.reg .b32 r1;
add.cc.u32 r0, 0xc2775652, 0x4c60baa8;
addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6;
mov.b64 %rd1, {r0, r1}
;
// end inline asm
st.u64 [%SP+0], %rd1;
// begin inline asm
add.u64 %rd2, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8;
// end inline asm
st.u64 [%SP+8], %rd2;
// vprintf(_$_str, &arg-pack) for the 32-bit result; the value is copied
// into the arg slot at +16 and its generic address is passed as param1.
ld.u64 %rd4, [%SP+0];
st.u64 [%SP+16], %rd4;
mov.u64 %rd5, _$_str;
cvta.global.u64 %rd6, %rd5;
add.u64 %rd7, %SP, 16;
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
.param .b64 param1;
st.param.b64 [param1+0], %rd7;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r1, [retval0+0];
} // callseq 0
// Same sequence for the 64-bit result, arg slot at +24, format _$_str1.
ld.u64 %rd8, [%SP+8];
st.u64 [%SP+24], %rd8;
mov.u64 %rd9, _$_str1;
cvta.global.u64 %rd10, %rd9;
add.u64 %rd11, %SP, 24;
{ // callseq 1, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd10;
.param .b64 param1;
st.param.b64 [param1+0], %rd11;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r3, [retval0+0];
} // callseq 1
ret;
// -- End function
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31833905
// Cuda compilation tools, release 11.8, V11.8.89
// Based on NVVM 7.0.1
//
.version 7.8
.target sm_86
.address_size 64
// .globl _Z10addcKernelv
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.global .align 1 .b8 $str[27] = {51, 50, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0};
.global .align 1 .b8 $str$1[27] = {54, 52, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0};
// addcKernel as emitted by the NVIDIA NVVM compiler (nvcc): tighter than the
// LLVM -O0 output above — a single 8-byte local slot (%rd3 generic / %rd4
// local address) is reused as the vprintf argument buffer for both calls,
// and the results stay in registers (%rd1, %rd2) instead of stack slots.
.visible .entry _Z10addcKernelv()
{
.local .align 8 .b8 __local_depot0[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<3>;
.reg .b64 %rd<9>;
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
add.u64 %rd3, %SP, 0;
add.u64 %rd4, %SPL, 0;
// begin inline asm
.reg .b32 r0;
.reg .b32 r1;
add.cc.u32 r0, 0xc2775652, 0x4c60baa8;
addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6;
mov.b64 %rd1, {r0, r1}
;
// end inline asm
// begin inline asm
add.u64 %rd2, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8;
// end inline asm
// First printf: stash the 32-bit-path result in the shared arg slot.
st.local.u64 [%rd4], %rd1;
mov.u64 %rd5, $str;
cvta.global.u64 %rd6, %rd5;
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
.param .b64 param1;
st.param.b64 [param1+0], %rd3;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r1, [retval0+0];
} // callseq 0
// Second printf: overwrite the same slot with the 64-bit-path result.
st.local.u64 [%rd4], %rd2;
mov.u64 %rd7, $str$1;
cvta.global.u64 %rd8, %rd7;
{ // callseq 1, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd8;
.param .b64 param1;
st.param.b64 [param1+0], %rd3;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r2, [retval0+0];
} // callseq 1
ret;
}
; ModuleID = 'build/nvidia/wideint.cu'
source_filename = "build/nvidia/wideint.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%printf_args = type { i32 }
@.str = private unnamed_addr constant [5 x i8] c"%02X\00", align 1
; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
; Device kernel add256Kernel() at -O0 (optnone): the _ExtInt(256) values are
; native i256 IR integers in 32-byte stack slots (%1 = a, %2 = b, %3 = c);
; the single `add nsw i256` below is what the NVPTX backend must legalize
; into 64-bit limb additions with manual carry propagation.
define dso_local void @_Z12add256Kernelv() #0 {
%1 = alloca i256, align 8
%2 = alloca i256, align 8
%3 = alloca i256, align 8
%4 = alloca i32, align 4
%5 = alloca %printf_args, align 8
store i256 43520, i256* %1, align 8
store i256 1, i256* %2, align 8
store i256 0, i256* %3, align 8
%6 = load i256, i256* %1, align 8
%7 = load i256, i256* %2, align 8
; Full 256-bit addition in one IR instruction; c = a + b.
%8 = add nsw i256 %6, %7
store i256 %8, i256* %3, align 8
store i32 0, i32* %4, align 4
br label %9
; Loop header: iterate i over the 32 bytes of c.
9: ; preds = %22, %0
%10 = load i32, i32* %4, align 4
%11 = icmp slt i32 %10, 32
br i1 %11, label %12, label %25
; Loop body: printf("%02X", ((unsigned char*)&c)[i]) via vprintf.
12: ; preds = %9
%13 = bitcast i256* %3 to i8*
%14 = load i32, i32* %4, align 4
%15 = sext i32 %14 to i64
%16 = getelementptr inbounds i8, i8* %13, i64 %15
%17 = load i8, i8* %16, align 1
%18 = zext i8 %17 to i32
%19 = getelementptr inbounds %printf_args, %printf_args* %5, i32 0, i32 0
store i32 %18, i32* %19, align 4
%20 = bitcast %printf_args* %5 to i8*
%21 = call i32 @vprintf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i8* %20)
br label %22
; Loop latch: i++.
22: ; preds = %12
%23 = load i32, i32* %4, align 4
%24 = add nsw i32 %23, 1
store i32 %24, i32* %4, align 4
br label %9, !llvm.loop !7
25: ; preds = %9
ret void
}
declare i32 @vprintf(i8*, i8*)
attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_86" "target-features"="+ptx72,+sm_86" }
!llvm.module.flags = !{!0, !1, !2, !3}
!nvvm.annotations = !{!4}
!llvm.ident = !{!5, !6}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 2]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{i32 7, !"frame-pointer", i32 2}
!4 = !{void ()* @_Z12add256Kernelv, !"kernel", i32 1}
!5 = !{!"clang version 13.0.1"}
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.mustprogress"}
// Compile with LLVM
// /usr/lib/llvm13/bin/clang++ -S -emit-llvm \
// build/nvidia/wideint.cu \
// --cuda-gpu-arch=sm_86 \
// -L/opt/cuda/lib64 \
// -lcudart_static -ldl -lrt -pthread
// /usr/lib/llvm13/bin/clang++ build/nvidia/wideint.cu \
// -o build/nvidia/wideint \
// --cuda-gpu-arch=sm_86 \
// -L/opt/cuda/lib64 \
// -lcudart_static -ldl -lrt -pthread
// llc -mcpu=sm_86 build/nvidia/wideint-cuda-nvptx64-nvidia-cuda-sm_86.ll -o build/nvidia/wideint_llvm.ptx
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdint>
#include <stdio.h>
typedef _ExtInt(256) u256;
cudaError_t add256();
// Kernel: adds two 256-bit (_ExtInt(256)) integers and prints all 32 bytes
// of the sum in memory order as two-digit hex (NVPTX is little-endian per
// the module data layout, so the least-significant byte comes first).
__global__ void add256Kernel() {
u256 lhs = 0xAA00;
u256 rhs = 0x1;
u256 sum = 0;
sum = lhs + rhs;
const unsigned char* bytes = (const unsigned char*)(&sum);
for (int idx = 0; idx < 32; ++idx) {
printf("%02X", bytes[idx]);
}
}
// Host entry point: run the 256-bit addition demo, reset the device, then
// block on a key press so the printed bytes remain on screen.
int main()
{
if (add256() != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
if (cudaDeviceReset() != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
getchar();
return 0;
}
// Launches add256Kernel on device 0 with a single thread and waits for it to
// finish. Returns the first CUDA error encountered, cudaSuccess otherwise.
cudaError_t add256()
{
cudaError_t cudaStatus;
// Fix: the result of cudaSetDevice was previously discarded (immediately
// overwritten by cudaGetLastError), hiding device-selection failures.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
add256Kernel<<<1, 1>>>();
// A kernel launch is asynchronous and returns no error directly;
// launch-configuration errors surface through cudaGetLastError.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "add256Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Execution errors inside the kernel only surface at a synchronizing call.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching add256Kernel!\n", cudaStatus);
goto Error;
}
Error:
return cudaStatus;
}
//
// Generated by LLVM NVPTX Back-End
//
.version 7.1
.target sm_86
.address_size 64
// .globl _Z12add256Kernelv // -- Begin function _Z12add256Kernelv
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.global .align 1 .b8 _$_str[5] = {37, 48, 50, 88, 0};
// @_Z12add256Kernelv
// add256Kernel lowered by the LLVM NVPTX backend at -O0. Frame layout in
// __local_depot0: a at +0..24, b at +32..56, c at +64..88 (four 64-bit limbs
// each, least-significant limb first), loop counter i at +96, printf arg
// slot at +104. The single i256 add is legalized into four add.s64 limb
// additions with carries reconstructed via setp/selp compare chains —
// PTX has no 64-bit add-with-carry instruction the legalizer uses here,
// which is the codegen-quality issue this gist investigates.
.visible .entry _Z12add256Kernelv()
{
.local .align 8 .b8 __local_depot0[112];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<13>;
.reg .b32 %r<15>;
.reg .b64 %rd<35>;
// %bb.0:
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
// Initialize a = 43520 (0xAA00), b = 1, c = 0, limb by limb.
mov.u64 %rd1, 0;
st.u64 [%SP+24], %rd1;
st.u64 [%SP+16], %rd1;
st.u64 [%SP+8], %rd1;
mov.u64 %rd2, 43520;
st.u64 [%SP+0], %rd2;
st.u64 [%SP+56], %rd1;
st.u64 [%SP+48], %rd1;
st.u64 [%SP+40], %rd1;
mov.u64 %rd3, 1;
st.u64 [%SP+32], %rd3;
st.u64 [%SP+88], %rd1;
st.u64 [%SP+80], %rd1;
st.u64 [%SP+72], %rd1;
st.u64 [%SP+64], %rd1;
// Reload all eight limbs (optnone keeps the redundant store/load traffic):
// %rd7..%rd4 = a limbs 0..3, %rd11..%rd8 = b limbs 0..3.
ld.u64 %rd4, [%SP+24];
ld.u64 %rd5, [%SP+16];
ld.u64 %rd6, [%SP+8];
ld.u64 %rd7, [%SP+0];
ld.u64 %rd8, [%SP+56];
ld.u64 %rd9, [%SP+48];
ld.u64 %rd10, [%SP+40];
ld.u64 %rd11, [%SP+32];
// Limb 0: %rd12 = a0 + b0; carry-out detected as (sum < addend).
add.s64 %rd12, %rd7, %rd11;
setp.lt.u64 %p1, %rd12, %rd11;
setp.lt.u64 %p2, %rd12, %rd7;
selp.u64 %rd13, 1, 0, %p2;
selp.b64 %rd14, 1, %rd13, %p1;
// Limb 1: %rd16 = a1 + b1 + carry0; carry-out of the three-way sum is
// rebuilt from the compare/select chain below (%p3..%p7, %rd19).
add.s64 %rd15, %rd6, %rd10;
add.s64 %rd16, %rd15, %rd14;
setp.eq.s64 %p3, %rd16, %rd10;
setp.lt.u64 %p4, %rd16, %rd10;
selp.u32 %r1, -1, 0, %p4;
selp.u32 %r2, -1, 0, %p1;
selp.b32 %r3, %r2, %r1, %p3;
and.b32 %r4, %r3, 1;
setp.eq.b32 %p5, %r4, 1;
setp.eq.s64 %p6, %rd16, %rd6;
setp.lt.u64 %p7, %rd16, %rd6;
selp.u32 %r5, -1, 0, %p7;
selp.u32 %r6, -1, 0, %p2;
selp.b32 %r7, %r6, %r5, %p6;
cvt.u64.u32 %rd17, %r7;
and.b64 %rd18, %rd17, 1;
selp.b64 %rd19, 1, %rd18, %p5;
// Limb 2: %rd21 = a2 + b2 + carry1, with both partial-sum carries.
add.s64 %rd20, %rd5, %rd9;
add.s64 %rd21, %rd20, %rd19;
setp.lt.u64 %p8, %rd21, %rd19;
setp.lt.u64 %p9, %rd21, %rd20;
selp.u64 %rd22, 1, 0, %p9;
selp.b64 %rd23, 1, %rd22, %p8;
setp.lt.u64 %p10, %rd20, %rd9;
setp.lt.u64 %p11, %rd20, %rd5;
selp.u64 %rd24, 1, 0, %p11;
selp.b64 %rd25, 1, %rd24, %p10;
// Limb 3 (topmost): carries are added in; no carry-out needed.
add.s64 %rd26, %rd4, %rd8;
add.s64 %rd27, %rd26, %rd25;
add.s64 %rd28, %rd27, %rd23;
// Store c = a + b, then i = 0.
st.u64 [%SP+64], %rd12;
st.u64 [%SP+72], %rd16;
st.u64 [%SP+80], %rd21;
st.u64 [%SP+88], %rd28;
mov.u32 %r8, 0;
st.u32 [%SP+96], %r8;
bra.uni LBB0_1;
// Loop: print the 32 bytes of c as "%02X" each.
LBB0_1: // =>This Inner Loop Header: Depth=1
ld.u32 %r9, [%SP+96];
setp.gt.s32 %p12, %r9, 31;
@%p12 bra LBB0_4;
bra.uni LBB0_2;
LBB0_2: // in Loop: Header=BB0_1 Depth=1
ld.s32 %rd29, [%SP+96];
add.u64 %rd30, %SP, 64;
add.s64 %rd31, %rd30, %rd29;
ld.u8 %r10, [%rd31];
st.u32 [%SP+104], %r10;
mov.u64 %rd32, _$_str;
cvta.global.u64 %rd33, %rd32;
add.u64 %rd34, %SP, 104;
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd33;
.param .b64 param1;
st.param.b64 [param1+0], %rd34;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r11, [retval0+0];
} // callseq 0
bra.uni LBB0_3;
LBB0_3: // in Loop: Header=BB0_1 Depth=1
ld.u32 %r13, [%SP+96];
add.s32 %r14, %r13, 1;
st.u32 [%SP+96], %r14;
bra.uni LBB0_1;
LBB0_4:
ret;
// -- End function
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment