- Constant materialisation
- Sign extensions
- Comparisons
- Math optimisations
- Inefficient float loads
- FP comparison branchiness
- FP constant materialisation
- Unnecessary seqz, and redundant and after feq.d
- Unnecessary FP conversions
- Dead branch instruction
C
long a() { return 0x94BFE000; }
GCC RV64GC
li a0,0x4A5FF000 # lui only
slli a0,a0,1
ret
Clang RV64GC
lui a0, 74
addiw a0, a0, 1535
slli a0, a0, 13
ret
C
long a(long b) {
int c = ++b;
return c;
}
GCC
addiw a0,a0,1
ret
Clang
slli a0, a0, 32
addi a1, zero, 1
slli a1, a1, 32
add a0, a0, a1
srai a0, a0, 32
ret
C
int a(int b) {
int c = b * b, d = c % b, e = !d;
return e;
}
GCC
mulw a5,a0,a0
remw a0,a5,a0
seqz a0,a0
ret
Clang
mul a1, a0, a0
remw a0, a1, a0
slli a0, a0, 32
srli a0, a0, 32
seqz a0, a0
ret
C
int a(int b) {
int c = 6472 >> b;
return c;
}
GCC
li a5,8192
addiw a5,a5,-1720
sraw a0,a5,a0
ret
Clang
lui a1, 2
addiw a1, a1, -1720
srlw a0, a1, a0
slli a0, a0, 32
srli a0, a0, 32
ret
C
int a(int b) {
int c = 8 / b;
return c;
}
GCC
li a5,8
divw a0,a5,a0
ret
Clang
addi a1, zero, 8
div a0, a1, a0
sext.w a0, a0
ret
C
int a(int b) {
int c = -57, d = c << b;
return d;
}
GCC
li a5,-57
sllw a0,a5,a0
ret
Clang
addi a1, zero, 1
slli a1, a1, 32
addi a1, a1, -57
sllw a0, a1, a0
ret
Maybe it's just a sign extension issue, maybe it's more:
C
int a(int b) {
int c = ~b;
--c;
return c;
}
GCC
not a0,a0
addiw a0,a0,-1
ret
Clang
addi a1, zero, 1
slli a1, a1, 32
addi a1, a1, -2
subw a0, a1, a0
ret
C
int a(int b) {
short c = -b;
char d = -c;
return d;
}
GCC
andi a0,a0,0xff
ret
Clang
slli a0, a0, 16
neg a0, a0
srli a0, a0, 16
neg a0, a0
andi a0, a0, 255
ret
C
long a(long b) {
int c = -b;
return c;
}
GCC RV64GC
negw a0,a0
ret
Clang
slli a0, a0, 32
neg a0, a0
srai a0, a0, 32
ret
C
int a(int b) {
int c = 0 >= b, d = -c;
return d;
}
GCC
slti a0,a0,1
subw a0,zero,a0
ret
Clang
add a1, zero, a0
addi a2, zero, 1
addi a0, zero, -1
blt a1, a2, .LBB0_2
mv a0, zero
.LBB0_2:
ret
C
int a(short b) {
double c = 7 >= b;
++c;
return c;
}
GCC RV64GC
slti a0,a0,8
addi a0,a0,1
ret
Clang RV64GC
add a1, zero, a0
addi a2, zero, 8
addi a0, zero, 2
blt a1, a2, .LBB0_2
# %bb.1:
addi a0, zero, 1
.LBB0_2:
ret
Unnecessary seqz.
C
float a(double b, int c) {
float d = c ?: b;
return d;
}
GCC RV64GC
beq a0,zero,.L2
fcvt.d.w fa0,a0
.L2:
fcvt.s.d fa0,fa0
ret
Clang RV64GC
seqz a1, a0
bnez a1, .LBB0_2
# %bb.1:
fcvt.d.l fa0, a0
.LBB0_2:
fcvt.s.d fa0, fa0
ret
C
int a(short b) {
short c = 0 <= b;
return c;
}
GCC RV32GC
not a0,a0
srli a0,a0,31
ret
Clang RV32GC
not a0, a0
lui a1, 8
and a0, a0, a1
srli a0, a0, 15
ret
C
int a(int b) {
int c = ~b--, d = c - b;
return d;
}
GCC
slli a0,a0,1
neg a0,a0
ret
Clang
not a1, a0
sub a0, a1, a0
addi a0, a0, 1
ret
C
int a(int b) {
int c = b + b, d = b == c;
return d;
}
GCC
seqz a0,a0
ret
Clang
slli a1, a0, 1
xor a0, a0, a1
seqz a0, a0
ret
C
int a(short b) {
char c = b, e = -b;
short d = !c;
int f = e ? d : e;
char g = f ?: 7;
return g;
}
GCC RV32GC
li a0,7
ret
Clang RV32GC
neg a1, a0
andi a1, a1, 255
bnez a1, .LBB0_3
# %bb.1:
addi a0, zero, 7
bnez a1, .LBB0_4
.LBB0_2:
ret
.LBB0_3:
andi a0, a0, 255
seqz a1, a0
addi a0, zero, 7
beqz a1, .LBB0_2
.LBB0_4:
add a0, zero, a1
ret
C
char a(char b, short c) {
int d = ++b;
char e = b ?: c;
short f = e ? c : d;
return f;
}
GCC RV64GC
andi a0,a1,0xff
ret
Clang RV32GC
addi a3, a0, 1
andi a0, a3, 255
add a2, zero, a1
beq a0, a3, .LBB0_3
# %bb.1:
andi a2, a2, 255
bnez a2, .LBB0_4
.LBB0_2:
andi a0, a0, 255
ret
.LBB0_3:
add a2, zero, a0
andi a2, a2, 255
beqz a2, .LBB0_2
.LBB0_4:
add a0, zero, a1
andi a0, a0, 255
ret
C
int a;
float b() {
double c = a == a;
return c;
}
GCC
lui a5,%hi(.LC0)
flw fa0,%lo(.LC0)(a5)
ret
Clang
lui a0, %hi(.LCPI0_0)
addi a0, a0, %lo(.LCPI0_0)
flw fa0, 0(a0)
ret
C
double a(float b) {
long c = !b;
return c;
}
GCC
fmv.s.x fa5,zero
feq.s a5,fa0,fa5
fcvt.d.w fa0,a5
ret
Clang
fmv.w.x ft0, zero
feq.s a0, fa0, ft0
bnez a0, .LBB0_2
# %bb.1:
fcvt.d.w fa0, zero
ret
.LBB0_2:
lui a0, %hi(.LCPI0_0)
addi a0, a0, %lo(.LCPI0_0)
fld fa0, 0(a0)
ret
C
float a(int b, int c) {
float d = b >= c;
return d;
}
GCC
bge a0,a1,.L3
fmv.s.x fa0,zero
ret
.L3:
lui a5,%hi(.LC0)
flw fa0,%lo(.LC0)(a5)
ret
Clang
slt a0, a0, a1
xori a0, a0, 1
bnez a0, .LBB0_2
# %bb.1:
fmv.w.x fa0, zero
ret
.LBB0_2:
lui a0, %hi(.LCPI0_0)
addi a0, a0, %lo(.LCPI0_0)
flw fa0, 0(a0)
ret
C
float a(float b) {
long c = b, d = !c;
return d;
}
GCC
fcvt.l.s a5,fa0,rtz
seqz a5,a5
fcvt.s.l fa0,a5
ret
Clang
fcvt.l.s a0, fa0, rtz
seqz a0, a0
bnez a0, .LBB0_2
# %bb.1:
fmv.w.x fa0, zero
ret
.LBB0_2:
lui a0, %hi(.LCPI0_0)
addi a0, a0, %lo(.LCPI0_0)
flw fa0, 0(a0)
ret
Bad roundtripping:
C
double a(float b) {
char c = b == b;
return c;
}
GCC
feq.s a5,fa0,fa0
fcvt.d.wu fa0,a5
ret
Clang
feq.s a0, fa0, fa0
and a0, a0, a0
bnez a0, .LBB0_2
# %bb.1:
fcvt.d.w fa0, zero
ret
.LBB0_2:
lui a0, %hi(.LCPI0_0)
addi a0, a0, %lo(.LCPI0_0)
fld fa0, 0(a0)
ret
Even though GCC also doesn't do that, we could materialise small FP constants with addi xn, zero, imm12
plus one int-to-FP conversion instruction. Even if that were slower on some uarchs, we could still use it
for -Os.
C
float test(float a) {
return a + 1.0;
}
Clang
lui a1, %hi(.LCPI0_0)
addi a1, a1, %lo(.LCPI0_0)
flw ft0, 0(a1)
fmv.w.x ft1, a0
fadd.s ft0, ft1, ft0
fmv.x.w a0, ft0
ret
C
long a(double b) {
long c = b <= b;
return c;
}
GCC
feq.d a0,fa0,fa0
ret
Clang
feq.d a0, fa0, fa0
and a0, a0, a0
ret
C
float a(long b) {
double c = ~b;
return c;
}
GCC
not a0,a0
fcvt.s.w fa0,a0
ret
Clang
not a0, a0
fcvt.d.w ft0, a0
fcvt.s.d fa0, ft0
ret
C
int a(char b, int c, short d) {
double e = d;
int f = e ? c : e;
return f;
}
GCC
li a0,0
beq a2,zero,.L2
mv a0,a1
.L2:
ret
Clang
seqz a0, a2
bnez a0, .LBB0_2
# %bb.1:
fcvt.d.l ft0, a1
fcvt.l.d a0, ft0, rtz
ret
.LBB0_2:
fcvt.d.l ft0, a2
fcvt.l.d a0, ft0, rtz
ret
C
float a(char b) {
double c = b;
return c;
}
GCC
fcvt.s.wu fa0,a0
ret
Clang
fcvt.d.wu ft0, a0
fcvt.s.d fa0, ft0
ret
C
char b(char c, short d) {
short e = c ? d : a;
int f = c ? e : 2;
return f;
}
GCC
bnez a0,.L2
li a1,2
.L2:
andi a0,a1,0xff
ret
Clang
beqz a0, .LBB0_3
# %bb.1:
beqz a0, .LBB0_4
.LBB0_2:
andi a0, a1, 255
ret
.LBB0_3:
lui a1, %hi(a)
lw a1, %lo(a)(a1)
bnez a0, .LBB0_2
.LBB0_4:
addi a1, zero, 2
andi a0, a1, 255
ret