EarlyCSE performs unwanted optimization – Analysis
Recall that precall_hook
and postcall_hook
markers surround IR instructions containing calls to opaque functions (registers_clobbered
) which are used to say which register variables were clobbered by the original callee. All calls have attribute readonly
and nounwind
. opaque_true
function call is used to prevent DCE (we make all branch conditions opaque).
Before EarlyCSE
In the process of testing tac
coreutil binary, for a specific functions we have the following BBs (only relevant parts are included):
; *** IR Dump After Simplify the CFG ***
bb.xmalloc.0x11_L0_cloned: ; preds = %bb.xmalloc.0xc_L0_ft_cloned
%162 = load i64, i64* %rsp, align 8
%163 = add i64 %162, -8
%164 = inttoptr i64 %163 to i64*
store i64 4209786, i64* %164, align 1
store i64 %163, i64* %rsp, align 8
store i64 4209856, i64* %pc, align 8
call void @precall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209856 })
%165 = call i64 @registers_clobbered.491()
store i64 %165, i64* %r15, align 8
%166 = call i64 @registers_clobbered.491()
store i64 %166, i64* %rax, align 8
%167 = call i64 @registers_clobbered.491()
store i64 %167, i64* %rbp, align 8
%168 = call i64 @registers_clobbered.491()
store i64 %168, i64* %rbx, align 8
; ... other registers clobbered...
%187 = call i64 @registers_clobbered.491()
store i64 %187, i64* %state_0x8618, align 8
%188 = load i64, i64* %rsp, align 8
%189 = add i64 %188, 8
store i64 %189, i64* %rsp, align 8
call void @postcall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209856 })
; ...
%196 = load i64, i64* %cc_dst, align 8
%.not87859_cloned = icmp eq i64 %196, 0
%197 = call i1 @opaque_true(i1 %.not87859_cloned)
br i1 %197, label %bb.xrealloc.0x7_L0_ft_cloned, label %bb.xrealloc.0x7_L0_cloned
bb.xrealloc.0x7_L0_ft_cloned: ; preds = %bb.xmalloc.0x11_L0_cloned
%198 = load i64, i64* %rdi, align 8
store i64 %198, i64* %cc_dst, align 8
%199 = load i64, i64* %cc_dst, align 8
%.not87858_cloned = icmp eq i64 %199, 0
%200 = call i1 @opaque_true(i1 %.not87858_cloned)
br i1 %200, label %bb.xrealloc.0xc_L0_ft_cloned, label %bb.xrealloc.0xc_L0_cloned
In particular, one can notice how the two conditional branches (br i1 %197
and br i1 %200
) depend on different conditions (icmp eq
over %196
for the first one, %199
for the other one). The register variable they depend is the same (%cc_dst
), but the value is different.
Now, this is how the two BBs look after SROA passed. The alloca
s have been replaced with simpler loads of CSV, and, clearly, the stores interleaved in the callee's summary have been optimized out.
bb.xmalloc.0x11_L0_cloned: ; preds = %bb.xmalloc.0xc_L0_ft_cloned
%99 = add i64 %50, -8
%100 = inttoptr i64 %99 to i64*
store i64 4209786, i64* %100, align 1
call void @precall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress {
i32 0, i16 0, i16 4, i64 4209856 })
%101 = call i64 @registers_clobbered.871()
%102 = call i64 @registers_clobbered.871()
%103 = call i64 @registers_clobbered.871()
%104 = call i64 @registers_clobbered.871()
%105 = call i64 @registers_clobbered.871()
%106 = call i64 @registers_clobbered.871()
%107 = call i64 @registers_clobbered.871()
; ... other registers clobbered...
%123 = call i64 @registers_clobbered.871()
%124 = add i64 %99, 5246976
call void @postcall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209856 })
%125 = add i64 %124, -8
%126 = inttoptr i64 %125 to i64*
store i64 %104, i64* %126, align 1
%.not87859_cloned = icmp eq i64 %107, 0
%127 = call i1 @opaque_true(i1 %.not87859_cloned)
br i1 %127, label %bb.xrealloc.0x7_L0_ft_cloned, label %bb.xrealloc.0x7_L0_cloned
bb.xrealloc.0x7_L0_ft_cloned: ; preds = %bb.xmalloc.0x11_L0_cloned
%.not87858_cloned = icmp eq i64 %105, 0
%128 = call i1 @opaque_true(i1 %.not87858_cloned)
br i1 %128, label %bb.xrealloc.0xc_L0_ft_cloned, label %bb.xrealloc.0xc_L0_cloned
Note how the two conditions have been optimized to be return values of different calls to registers_clobbered.871()
(%105
and %107
respectively).
After EarlyCSE
While EarlyCSE is running, this is how the BBs look (we are in a stage where only bb.xmalloc.0x11_L0_cloned
BB has been optimized, the other BB has not been optimized yet):
bb.xmalloc.0x11_L0_cloned: ; preds = %bb.xmalloc.0xc_L0_ft_cloned
store i64 4209786, i64* %31, align 1
call void @precall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209856 })
%82 = call i64 @registers_clobbered.871()
%83 = add i64 %30, 5246976
call void @postcall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209856 })
%84 = add i64 %83, -8
%85 = inttoptr i64 %84 to i64*
store i64 %82, i64* %85, align 1
%.not87859_cloned = icmp eq i64 %82, 0
%86 = call i1 @opaque_true(i1 %.not87859_cloned)
br i1 %86, label %bb.xrealloc.0x7_L0_ft_cloned, label %bb.xrealloc.0x7_L0_cloned
bb.xrealloc.0x7_L0_ft_cloned: ; preds = %bb.xmalloc.0x11_L0_cloned
%.not87858_cloned = icmp eq i64 %82, 0
%87 = call i1 @opaque_true(i1 %.not87858_cloned)
br i1 %87, label %bb.xrealloc.0xc_L0_ft_cloned, label %bb.xrealloc.0xc_L0_cloned
Reasonably, all the calls to registers_clobbered.871
, since identical, have been optimized out. What does this imply though? You got it: now the two conditions come from the very same return value of call i64 @registers_clobbered.871
(%82
), despite having different condition values originally. At this stage, those icmp
s instructions turn out to be really identical, and therefore they're gonna be CSE'd (took actually a while to figure everything out! :)
Confirm: this is how the two BBs look after EarlyCSE.
bb.xmalloc.0x11_L0_cloned: ; preds = %bb.xmalloc.0xc_L0_ft_cloned
store i64 4209786, i64* %31, align 1
call void @precall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress {
i32 0, i16 0, i16 4, i64 4209856 })
%55 = call i64 @registers_clobbered.847()
%56 = add i64 %30, 5246976
call void @postcall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress
{ i32 0, i16 0, i16 4, i64 4209856 })
%57 = add i64 %56, -8
%58 = inttoptr i64 %57 to i64*
store i64 %55, i64* %58, align 1
%.not87859_cloned = icmp eq i64 %55, 0
%59 = call i1 @opaque_true(i1 %.not87859_cloned)
br i1 %59, label %bb.xrealloc.0x7_L0_ft_cloned, label %bb.xrealloc.0x7_L0_cloned
6015
bb.xrealloc.0x7_L0_ft_cloned: ; preds = %bb.xmalloc.0x11_L0_cloned
br i1 %59, label %bb.xrealloc.0xc_L0_ft_cloned, label %bb.xrealloc.0xc_L0_cloned
LLVM optimized out the following instructions in the BB bb.xrealloc.0x7_L0_ft_cloned
:
%.not87858_cloned = icmp eq i64 %82, 0
%87 = call i1 @opaque_true(i1 %.not87858_cloned)
Looks like this if
in EarlyCSE is entered when handling instruction %.not87858_cloned = icmp eq i64 %82, 0
: the lookup
method succeedes having cached the previous identical condition (%.not87859_cloned = icmp eq i64 %82, 0
).
This is actually an issue: optimizing out the call to the opaque condition (opaque_true
) allows LLVM to perform DCE. Eventually, this may lead the indirect_branch_info
to be unable to survive, and impede us from finalizing the model.
Solution
At the beginning I kinda thought the interleaved stores should have not disappeared, so perhaps registers_clobbered
should have been marked w/ attribute inaccessiblememonly
and not readonly
. Then I realized SROA had just fulfilled its duty.
By the time I interiorized the problem altogether, the solution came pretty straightforward:
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/CodeGen/UnreachableBlockElim.h"
@@ -1780,10 +1781,10 @@ void CFEPAnalyzer<FunctionOracle>::integrateFunctionCallee(llvm::BasicBlock *BB,
for (GlobalVariable *Register : ClobberedRegisters) {
auto *CSVTy = Register->getType()->getPointerElementType();
auto *OpaqueRegistersClobberedCallee = RegistersClobberedPool
- .get(BB->getParent()->getName(),
+ .get(Register->getName(),
CSVTy,
{},
- "registers_clobbered");
+ "registers_clobbered_" + Twine(Register->getName()));
Builder.CreateStore(Builder.CreateCall(OpaqueRegistersClobberedCallee),
Register);
Now, we differentiate the various registers_clobbered
calls from the CSV being clobbered. After EarlyCSE, the IR looks exactly as we want:
bb.xmalloc.0x11_L0_cloned: ; preds = %bb.xmalloc.0xc_L0_ft_cloned
store i64 4209786, i64* %31, align 1
call void @precall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress {
i32 0, i16 0, i16 4, i64 4209856 })
%63 = call i64 @registers_clobbered_r15()
%64 = call i64 @registers_clobbered_rax()
%65 = call i64 @registers_clobbered_rbp()
%66 = call i64 @registers_clobbered_rbx()
%67 = call i64 @registers_clobbered_rdi()
; ... other registers clobbered...
%84 = call i64 @registers_clobbered_state_0x85d8()
%85 = call i64 @registers_clobbered_state_0x8618()
call void @postcall_hook(%struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209781 }, %struct.PlainMetaAddress { i32 0, i16 0, i16 4, i64 4209856 })
store i64 %66, i64* %31, align 1
%.not87859_cloned = icmp eq i64 %69, 0
%86 = call i1 @opaque_true(i1 %.not87859_cloned)
br i1 %86, label %bb.xrealloc.0x7_L0_ft_cloned, label %bb.xrealloc.0x7_L0_cloned
bb.xrealloc.0x7_L0_ft_cloned: ; preds = %bb.xmalloc.0x11_L0_cloned
%.not87858_cloned = icmp eq i64 %67, 0
%87 = call i1 @opaque_true(i1 %.not87858_cloned)
br i1 %87, label %bb.xrealloc.0xc_L0_ft_cloned, label %bb.xrealloc.0xc_L0_cloned
:)