From: Quentin Colombet
Date: Wed, 27 May 2015 06:28:41 +0000 (+0000)
Subject: [X86] Implement the support for shrink-wrapping.
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=60c91c28e4d2f49c3eaac1b20125854baa8d4c7c

[X86] Implement the support for shrink-wrapping.

With this patch, the x86 backend is now shrink-wrapping capable, and this
functionality can be tested by using the -enable-shrink-wrap switch.

The next step is to add more tests and enable shrink-wrapping by default
for x86.

Related to 

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@238293 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index 29ca3736aca..1b00997e750 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -88,8 +88,9 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
     const bool Uses64BitFramePtr =
         STI->isTarget64BitLP64() || STI->isTargetNaCl64();
-    bool UseLEAForSP =
-        X86FL->useLEAForSPInProlog(*MBB.getParent());
+    // Check if we should use LEA for SP.
+    bool UseLEAForSP = STI->useLeaForSP() &&
+                       X86FL->canUseLEAForSPInEpilogue(*MBB.getParent());
     unsigned StackPtr = TRI->getStackRegister();
     // Check for possible merge with preceding ADD instruction.
     StackAdj += X86FrameLowering::mergeSPUpdates(MBB, MBBI, StackPtr, true);
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 50377bdf586..db58d9c5f30 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -565,7 +565,6 @@ static uint64_t calculateMaxStackAlign(const MachineFunction &MF) {
 
 void X86FrameLowering::emitPrologue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
-  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *Fn = MF.getFunction();
@@ -965,15 +964,38 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   }
 }
 
-bool X86FrameLowering::useLEAForSPInProlog(const MachineFunction &MF) const {
+bool X86FrameLowering::canUseLEAForSPInEpilogue(
+    const MachineFunction &MF) const {
   // We can't use LEA instructions for adjusting the stack pointer if this is a
   // leaf function in the Win64 ABI.  Only ADD instructions may be used to
   // deallocate the stack.
   // This means that we can use LEA for SP in two situations:
   // 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
   // 2. We *have* a frame pointer which means we are permitted to use LEA.
-  return MF.getSubtarget<X86Subtarget>().useLeaForSP() &&
-         (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF));
+  return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
+}
+
+/// Check whether or not the terminators of \p MBB need to read EFLAGS.
+static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) {
+  for (const MachineInstr &MI : MBB.terminators()) {
+    bool BreakNext = false;
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != X86::EFLAGS)
+        continue;
+
+      // This terminator needs an EFLAGS value that is not defined
+      // by a previous terminator.
+      if (!MO.isDef())
+        return true;
+      BreakNext = true;
+    }
+    if (BreakNext)
+      break;
+  }
+  return false;
 }
 
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -983,9 +1005,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86RegisterInfo *RegInfo = STI.getRegisterInfo();
   const TargetInstrInfo &TII = *STI.getInstrInfo();
-  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
-  assert(MBBI != MBB.end() && "Returning block has no instructions");
-  DebugLoc DL = MBBI->getDebugLoc();
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
   bool Is64Bit = STI.is64Bit();
   // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
   const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
@@ -999,25 +1022,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry();
 
-  bool UseLEAForSP = useLEAForSPInProlog(MF);
-
-  switch (MBBI->getOpcode()) {
-  default:
-    llvm_unreachable("Can only insert epilogue into returning blocks");
-  case X86::RETQ:
-  case X86::RETL:
-  case X86::RETIL:
-  case X86::RETIQ:
-  case X86::TCRETURNdi:
-  case X86::TCRETURNri:
-  case X86::TCRETURNmi:
-  case X86::TCRETURNdi64:
-  case X86::TCRETURNri64:
-  case X86::TCRETURNmi64:
-  case X86::EH_RETURN:
-  case X86::EH_RETURN64:
-    break; // These are ok
-  }
+  bool UseLEAForSP = canUseLEAForSPInEpilogue(MF);
+  // If we can use LEA for SP but the subtarget would rather not, only fall
+  // back to ADD when none of the terminators reads the eflags. Otherwise
+  // the ADD we insert would redefine the eflags and break the condition.
+  // Alternatively, we could move the ADD, but this may not be possible
+  // and is an optimization anyway.
+  if (UseLEAForSP && !MF.getSubtarget<X86Subtarget>().useLeaForSP())
+    UseLEAForSP = terminatorsNeedFlagsAsInput(MBB);
+  // If that assert fires, it means we did not do the right thing
+  // in canUseAsEpilogue.
+  assert((UseLEAForSP || !terminatorsNeedFlagsAsInput(MBB)) &&
+         "We shouldn't have allowed this insertion point");
 
   // Get the number of bytes to allocate from the FrameInfo.
   uint64_t StackSize = MFI->getStackSize();
@@ -1056,7 +1072,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   }
   MachineBasicBlock::iterator FirstCSPop = MBBI;
 
-  DL = MBBI->getDebugLoc();
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
 
   // If there is an ADD32ri or SUB32ri of ESP immediately before this
   // instruction, merge the two instructions.
@@ -1514,8 +1531,6 @@ static const uint64_t kSplitStackAvailable = 256;
 
 void X86FrameLowering::adjustForSegmentedStacks(
     MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
-  assert(&PrologueMBB == &MF.front() &&
-         "Shrink-wrapping is not implemented yet");
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const TargetInstrInfo &TII = *STI.getInstrInfo();
@@ -1835,8 +1850,6 @@ void X86FrameLowering::adjustForHiPEPrologue(
   // If the stack frame needed is larger than the guaranteed then runtime checks
   // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
   if (MaxStack > Guaranteed) {
-    assert(&PrologueMBB == &MF.front() &&
-           "Shrink-wrapping is not implemented yet");
     MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
     MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
 
@@ -1979,3 +1992,15 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
     }
   }
 }
+
+bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+  assert(MBB.getParent() && "Block is not attached to a function!");
+
+  if (canUseLEAForSPInEpilogue(*MBB.getParent()))
+    return true;
+
+  // If we cannot use LEA to adjust SP, we may need to use ADD, which
+  // clobbers the EFLAGS. Check that none of the terminators reads the
+  // EFLAGS; if one does, conservatively assume it is not safe to
+  // insert the epilogue here.
+  return !terminatorsNeedFlagsAsInput(MBB);
+}
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index eeeda75d337..5d03b4db45c 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -96,8 +96,15 @@ public:
                              const TargetInstrInfo &TII,
                              const TargetRegisterInfo &TRI);
 
-  /// Check that LEA can be use on SP in a prologue sequence for \p MF.
-  bool useLEAForSPInProlog(const MachineFunction &MF) const;
+  /// Check that LEA can be used on SP in an epilogue sequence for \p MF.
+  bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
+
+  /// Check whether or not the given \p MBB can be used as an epilogue
+  /// for the target.
+  /// The epilogue will be inserted before the first terminator of that block.
+  /// This method is used by the shrink-wrapping pass to decide if
+  /// \p MBB will be correctly handled by the target.
+  bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
 
 private:
   /// convertArgMovsToPushes - This method tries to convert a call sequence
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
new file mode 100644
index 00000000000..5848eddf437
--- /dev/null
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -0,0 +1,600 @@
+; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+;
+; Note: Lots of tests use inline asm instead of regular calls.
+; This allows better control over what register allocation will do.
+; Otherwise, we may have spills right in the entry block, defeating
+; shrink-wrapping. Moreover, some of the inline asm statements (nop)
+; are here to ensure that the related paths do not end up as critical
+; edges.
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+
+; Initial motivating example: Simple diamond with a call just on one side.
+; CHECK-LABEL: foo:
+;
+; Compare the arguments and jump to exit.
+; No prologue needed.
+; ENABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
+; ENABLE-NEXT: cmpl %esi, [[ARG0CPY]]
+; ENABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+;
+; Prologue code.
+; (What we push does not matter. It should be some random scratch register.)
+; CHECK: pushq
+;
+; Compare the arguments and jump to exit.
+; After the prologue is set.
+; DISABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
+; DISABLE-NEXT: cmpl %esi, [[ARG0CPY]]
+; DISABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+;
+; Store %a in the alloca.
+; CHECK: movl [[ARG0CPY]], 4(%rsp)
+; Set the alloca address in the second argument.
+; CHECK-NEXT: leaq 4(%rsp), %rsi
+; Set the first argument to zero.
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: callq _doSomething
+;
+; With shrink-wrapping, epilogue is just after the call.
+; ENABLE-NEXT: addq $8, %rsp
+;
+; CHECK: [[EXIT_LABEL]]:
+;
+; Without shrink-wrapping, epilogue is in the exit block.
+; Epilogue code. (What we pop does not matter.)
+; DISABLE-NEXT: popq
+;
+; CHECK-NEXT: retq
+define i32 @foo(i32 %a, i32 %b) {
+  %tmp = alloca i32, align 4
+  %tmp2 = icmp slt i32 %a, %b
+  br i1 %tmp2, label %true, label %false
+
+true:
+  store i32 %a, i32* %tmp, align 4
+  %tmp4 = call i32 @doSomething(i32 0, i32* %tmp)
+  br label %false
+
+false:
+  %tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ]
+  ret i32 %tmp.0
+}
+
+; Function Attrs: optsize
+declare i32 @doSomething(i32, i32*)
+
+
+; Check that we do not perform the restore inside the loop while the save
+; is outside.
+; CHECK-LABEL: freqSaveAndRestoreOutsideLoop:
+;
+; Shrink-wrapping allows us to skip the prologue in the else case.
+; ENABLE: testl %edi, %edi
+; ENABLE: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; Prologue code.
+; Make sure we save the CSR used in the inline asm: rbx.
+; CHECK: pushq %rbx
+;
+; DISABLE: testl %edi, %edi
+; DISABLE: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; SUM is in %esi because it is coalesced with the second
+; argument on the else path.
+; CHECK: xorl [[SUM:%esi]], [[SUM]]
+; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
+;
+; Next BB.
+; CHECK: [[LOOP:LBB[0-9_]+]]: ## %for.body
+; CHECK: movl $1, [[TMP:%e[a-z]+]]
+; CHECK: addl [[TMP]], [[SUM]]
+; CHECK-NEXT: decl [[IV]]
+; CHECK-NEXT: jne [[LOOP]]
+;
+; Next BB.
+; SUM << 3.
+; CHECK: shll $3, [[SUM]]
+;
+; Jump to epilogue.
+; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+;
+; DISABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; DISABLE: addl %esi, %esi
+; DISABLE: [[EPILOG_BB]]: ## %if.end
+;
+; Epilogue code.
+; CHECK-DAG: popq %rbx
+; CHECK-DAG: movl %esi, %eax
+; CHECK: retq
+;
+; ENABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; ENABLE: addl %esi, %esi
+; ENABLE-NEXT: movl %esi, %eax
+; ENABLE-NEXT: retq
+define i32 @freqSaveAndRestoreOutsideLoop(i32 %cond, i32 %N) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.else, label %for.preheader
+
+for.preheader:
+  tail call void asm "nop", ""()
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
+  %sum.04 = phi i32 [ %add, %for.body ], [ 0, %for.preheader ]
+  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %add = add nsw i32 %call, %sum.04
+  %inc = add nuw nsw i32 %i.05, 1
+  %exitcond = icmp eq i32 %inc, 10
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %shl = shl i32 %add, 3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %mul = shl nsw i32 %N, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %for.end
+  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
+  ret i32 %sum.1
+}
+
+declare i32 @something(...)
+
+; Check that we do not perform the shrink-wrapping inside the loop even
+; though that would be legal. The cost model must prevent that.
+; CHECK-LABEL: freqSaveAndRestoreOutsideLoop2:
+; Prologue code.
+; Make sure we save the CSR used in the inline asm: rbx.
+; CHECK: pushq %rbx
+; CHECK: nop
+; CHECK: xorl [[SUM:%e[a-z]+]], [[SUM]]
+; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
+; Next BB.
+; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
+; CHECK: movl $1, [[TMP:%e[a-z]+]]
+; CHECK: addl [[TMP]], [[SUM]]
+; CHECK-NEXT: decl [[IV]]
+; CHECK-NEXT: jne [[LOOP_LABEL]]
+; Next BB.
+; CHECK: ## %for.exit
+; CHECK: nop
+; CHECK: popq %rbx
+; CHECK-NEXT: retq
+define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) {
+entry:
+  br label %for.preheader
+
+for.preheader:
+  tail call void asm "nop", ""()
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.04 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
+  %sum.03 = phi i32 [ 0, %for.preheader ], [ %add, %for.body ]
+  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %add = add nsw i32 %call, %sum.03
+  %inc = add nuw nsw i32 %i.04, 1
+  %exitcond = icmp eq i32 %inc, 10
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  tail call void asm "nop", ""()
+  br label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add
+}
+
+; Check with a more complex case that we do not have the save within the
+; loop while the restore is outside.
+; CHECK-LABEL: loopInfoSaveOutsideLoop:
+;
+; ENABLE: testl %edi, %edi
+; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; Prologue code.
+; Make sure we save the CSR used in the inline asm: rbx.
+; CHECK: pushq %rbx
+;
+; DISABLE: testl %edi, %edi
+; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; CHECK: nop
+; CHECK: xorl [[SUM:%esi]], [[SUM]]
+; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
+;
+; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
+; CHECK: movl $1, [[TMP:%e[a-z]+]]
+; CHECK: addl [[TMP]], [[SUM]]
+; CHECK-NEXT: decl [[IV]]
+; CHECK-NEXT: jne [[LOOP_LABEL]]
+; Next BB.
+; CHECK: nop
+; CHECK: shll $3, [[SUM]]
+;
+; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+;
+; DISABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; DISABLE: addl %esi, %esi
+; DISABLE: [[EPILOG_BB]]: ## %if.end
+;
+; Epilogue code.
+; CHECK-DAG: popq %rbx
+; CHECK-DAG: movl %esi, %eax
+; CHECK: retq
+;
+; ENABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; ENABLE: addl %esi, %esi
+; ENABLE-NEXT: movl %esi, %eax
+; ENABLE-NEXT: retq
+define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.else, label %for.preheader
+
+for.preheader:
+  tail call void asm "nop", ""()
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
+  %sum.04 = phi i32 [ %add, %for.body ], [ 0, %for.preheader ]
+  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %add = add nsw i32 %call, %sum.04
+  %inc = add nuw nsw i32 %i.05, 1
+  %exitcond = icmp eq i32 %inc, 10
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  tail call void asm "nop", "~{ebx}"()
+  %shl = shl i32 %add, 3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %mul = shl nsw i32 %N, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %for.end
+  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
+  ret i32 %sum.1
+}
+
+declare void @somethingElse(...)
+
+; Check with a more complex case that we do not have the restore within the
+; loop while the save is outside.
+; CHECK-LABEL: loopInfoRestoreOutsideLoop:
+;
+; ENABLE: testl %edi, %edi
+; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; Prologue code.
+; Make sure we save the CSR used in the inline asm: rbx.
+; CHECK: pushq %rbx
+;
+; DISABLE: testl %edi, %edi
+; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; CHECK: nop
+; CHECK: xorl [[SUM:%esi]], [[SUM]]
+; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
+;
+; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
+; CHECK: movl $1, [[TMP:%e[a-z]+]]
+; CHECK: addl [[TMP]], [[SUM]]
+; CHECK-NEXT: decl [[IV]]
+; CHECK-NEXT: jne [[LOOP_LABEL]]
+; Next BB.
+; CHECK: shll $3, [[SUM]]
+;
+; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+;
+; DISABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; DISABLE: addl %esi, %esi
+; DISABLE: [[EPILOG_BB]]: ## %if.end
+;
+; Epilogue code.
+; CHECK-DAG: popq %rbx
+; CHECK-DAG: movl %esi, %eax
+; CHECK: retq
+;
+; ENABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; ENABLE: addl %esi, %esi
+; ENABLE-NEXT: movl %esi, %eax
+; ENABLE-NEXT: retq
+define i32 @loopInfoRestoreOutsideLoop(i32 %cond, i32 %N) #0 {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void asm "nop", "~{ebx}"()
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %if.then
+  %i.05 = phi i32 [ 0, %if.then ], [ %inc, %for.body ]
+  %sum.04 = phi i32 [ 0, %if.then ], [ %add, %for.body ]
+  %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
+  %add = add nsw i32 %call, %sum.04
+  %inc = add nuw nsw i32 %i.05, 1
+  %exitcond = icmp eq i32 %inc, 10
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %shl = shl i32 %add, 3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %mul = shl nsw i32 %N, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %for.end
+  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
+  ret i32 %sum.1
+}
+
+; Check that we handle functions with no frame information correctly.
+; CHECK-LABEL: emptyFrame:
+; CHECK: ## %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+define i32 @emptyFrame() {
+entry:
+  ret i32 0
+}
+
+; Check that we handle inline asm correctly.
+; CHECK-LABEL: inlineAsm:
+;
+; ENABLE: testl %edi, %edi
+; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; Prologue code.
+; Make sure we save the CSR used in the inline asm: rbx.
+; CHECK: pushq %rbx
+;
+; DISABLE: testl %edi, %edi
+; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; CHECK: nop
+; CHECK: movl $10, [[IV:%e[a-z]+]]
+;
+; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
+; Inline asm statement.
+; CHECK: addl $1, %ebx
+; CHECK: decl [[IV]]
+; CHECK-NEXT: jne [[LOOP_LABEL]]
+; Next BB.
+; CHECK: nop
+; CHECK: xorl %esi, %esi
+;
+; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+;
+; DISABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; DISABLE: addl %esi, %esi
+; DISABLE: [[EPILOG_BB]]: ## %if.end
+;
+; Epilogue code.
+; CHECK-DAG: popq %rbx
+; CHECK-DAG: movl %esi, %eax
+; CHECK: retq
+;
+; ENABLE: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; ENABLE: addl %esi, %esi
+; ENABLE-NEXT: movl %esi, %eax
+; ENABLE-NEXT: retq
+define i32 @inlineAsm(i32 %cond, i32 %N) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.else, label %for.preheader
+
+for.preheader:
+  tail call void asm "nop", ""()
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.03 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
+  tail call void asm "addl $$1, %ebx", "~{ebx}"()
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 10
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  tail call void asm "nop", ""()
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %mul = shl nsw i32 %N, 1
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.else
+  %sum.0 = phi i32 [ %mul, %if.else ], [ 0, %for.exit ]
+  ret i32 %sum.0
+}
+
+; Check that we handle calls to variadic functions correctly.
+; CHECK-LABEL: callVariadicFunc:
+;
+; ENABLE: testl %edi, %edi
+; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; Prologue code.
+; CHECK: pushq
+;
+; DISABLE: testl %edi, %edi
+; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; Setup of the varargs.
+; CHECK: movl %esi, (%rsp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %esi, %r8d
+; CHECK-NEXT: movl %esi, %r9d
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: callq _someVariadicFunc
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $3, %esi
+;
+; ENABLE-NEXT: addq $8, %rsp
+; ENABLE-NEXT: movl %esi, %eax
+; ENABLE-NEXT: retq
+;
+; DISABLE: jmp [[IFEND_LABEL:LBB[0-9_]+]]
+;
+; CHECK: [[ELSE_LABEL]]: ## %if.else
+; Shift second argument by one and store into returned register.
+; CHECK: addl %esi, %esi
+;
+; DISABLE: [[IFEND_LABEL]]: ## %if.end
+;
+; Epilogue code.
+; CHECK-NEXT: movl %esi, %eax
+; DISABLE-NEXT: popq
+; CHECK-NEXT: retq
+define i32 @callVariadicFunc(i32 %cond, i32 %N) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 (i32, ...) @someVariadicFunc(i32 %N, i32 %N, i32 %N, i32 %N, i32 %N, i32 %N, i32 %N)
+  %shl = shl i32 %call, 3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %mul = shl nsw i32 %N, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %sum.0 = phi i32 [ %shl, %if.then ], [ %mul, %if.else ]
+  ret i32 %sum.0
+}
+
+declare i32 @someVariadicFunc(i32, ...)
+
+; Check that we use LEA so as not to clobber EFLAGS.
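+; (Illustration: `addq $8, %rsp` rewrites EFLAGS as a side effect of the
+; addition, while `leaq 8(%rsp), %rsp` performs the same stack adjustment
+; without touching EFLAGS. The epilogue in this function lands between a
+; `testq` and the `je` that consumes its flags, so the adjustment must be
+; flag-preserving.)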
+%struct.temp_slot = type { %struct.temp_slot*, %struct.rtx_def*, %struct.rtx_def*, i32, i64, %union.tree_node*, %union.tree_node*, i8, i8, i32, i32, i64, i64 } +%union.tree_node = type { %struct.tree_decl } +%struct.tree_decl = type { %struct.tree_common, i8*, i32, i32, %union.tree_node*, i48, %union.anon, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %union.anon.1, %union.tree_node*, %union.tree_node*, %union.tree_node*, i64, %struct.lang_decl* } +%struct.tree_common = type { %union.tree_node*, %union.tree_node*, i32 } +%union.anon = type { i64 } +%union.anon.1 = type { %struct.function* } +%struct.function = type { %struct.eh_status*, %struct.stmt_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, i8*, %union.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.ix86_args, %struct.rtx_def*, %struct.rtx_def*, i8*, %struct.initial_value_struct*, i32, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i64, %union.tree_node*, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, i32, %struct.rtx_def**, %struct.temp_slot*, i32, i32, i32, %struct.var_refs_queue*, i32, i32, i8*, %union.tree_node*, %struct.rtx_def*, i32, i32, %struct.machine_function*, i32, i32, %struct.language_function*, %struct.rtx_def*, i24 } +%struct.eh_status = type opaque +%struct.stmt_status = type opaque +%struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* } +%struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %union.tree_node*, %struct.sequence_stack*, i32, i32, i8*, i32, i8*, %union.tree_node**, %struct.rtx_def** } +%struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %union.tree_node*, %struct.sequence_stack* } +%struct.varasm_status = type opaque +%struct.ix86_args = type { i32, i32, i32, i32, i32, i32, i32 } +%struct.initial_value_struct = type opaque +%struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* } +%struct.machine_function = type opaque +%struct.language_function = type opaque +%struct.lang_decl = type opaque +%struct.rtx_def = type { i32, [1 x %union.rtunion_def] } +%union.rtunion_def = type { i64 } + +declare hidden fastcc %struct.temp_slot* @find_temp_slot_from_address(%struct.rtx_def* readonly) + +; CHECK-LABEL: useLEA: +; DISABLE: pushq +; +; CHECK: testq %rdi, %rdi +; CHECK-NEXT: je [[CLEANUP:LBB[0-9_]+]] +; +; CHECK: movzwl (%rdi), [[BF_LOAD:%e[a-z]+]] +; CHECK-NEXT: cmpl $66, [[BF_LOAD]] +; CHECK-NEXT: jne [[CLEANUP]] +; +; CHECK: movq 8(%rdi), %rdi +; CHECK-NEXT: movzwl (%rdi), %e[[BF_LOAD2:[a-z]+]] +; CHECK-NEXT: leal -54(%r[[BF_LOAD2]]), [[TMP:%e[a-z]+]] +; CHECK-NEXT: cmpl $14, [[TMP]] +; CHECK-NEXT: ja [[LOR_LHS_FALSE:LBB[0-9_]+]] +; +; CHECK: movl $24599, [[TMP2:%e[a-z]+]] +; CHECK-NEXT: btl [[TMP]], [[TMP2]] +; CHECK-NEXT: jb [[CLEANUP]] +; +; CHECK: [[LOR_LHS_FALSE]]: ## %lor.lhs.false +; CHECK: cmpl $134, %e[[BF_LOAD2]] +; CHECK-NEXT: je [[CLEANUP]] +; +; CHECK: cmpl $140, %e[[BF_LOAD2]] +; CHECK-NEXT: je [[CLEANUP]] +; +; ENABLE: pushq +; CHECK: callq _find_temp_slot_from_address +; CHECK-NEXT: testq %rax, %rax +; +; The adjustment must use LEA here 
(or be moved above the test). +; ENABLE-NEXT: leaq 8(%rsp), %rsp +; +; CHECK-NEXT: je [[CLEANUP]] +; +; CHECK: movb $1, 57(%rax) +; +; CHECK: [[CLEANUP]]: ## %cleanup +; DISABLE: popq +; CHECK-NEXT: retq +define void @useLEA(%struct.rtx_def* readonly %x) { +entry: + %cmp = icmp eq %struct.rtx_def* %x, null + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %tmp = getelementptr inbounds %struct.rtx_def, %struct.rtx_def* %x, i64 0, i32 0 + %bf.load = load i32, i32* %tmp, align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp1 = icmp eq i32 %bf.clear, 66 + br i1 %cmp1, label %lor.lhs.false, label %cleanup + +lor.lhs.false: ; preds = %if.end + %arrayidx = getelementptr inbounds %struct.rtx_def, %struct.rtx_def* %x, i64 0, i32 1, i64 0 + %rtx = bitcast %union.rtunion_def* %arrayidx to %struct.rtx_def** + %tmp1 = load %struct.rtx_def*, %struct.rtx_def** %rtx, align 8 + %tmp2 = getelementptr inbounds %struct.rtx_def, %struct.rtx_def* %tmp1, i64 0, i32 0 + %bf.load2 = load i32, i32* %tmp2, align 8 + %bf.clear3 = and i32 %bf.load2, 65535 + switch i32 %bf.clear3, label %if.end.55 [ + i32 67, label %cleanup + i32 68, label %cleanup + i32 54, label %cleanup + i32 55, label %cleanup + i32 58, label %cleanup + i32 134, label %cleanup + i32 56, label %cleanup + i32 140, label %cleanup + ] + +if.end.55: ; preds = %lor.lhs.false + %call = tail call fastcc %struct.temp_slot* @find_temp_slot_from_address(%struct.rtx_def* %tmp1) #2 + %cmp59 = icmp eq %struct.temp_slot* %call, null + br i1 %cmp59, label %cleanup, label %if.then.60 + +if.then.60: ; preds = %if.end.55 + %addr_taken = getelementptr inbounds %struct.temp_slot, %struct.temp_slot* %call, i64 0, i32 8 + store i8 1, i8* %addr_taken, align 1 + br label %cleanup + +cleanup: ; preds = %if.then.60, %if.end.55, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %if.end, %entry + ret void +}
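
---

To visualize what the new test expects, here is one concrete shape of code that
the @foo CHECK lines accept when shrink-wrapping is enabled. The test leaves
the scratch and copy registers open, so the ones below (%eax, %rax) are
illustrative picks, not something the patch pins down:

        ## llc -enable-shrink-wrap=true, function @foo
_foo:
        movl    %edi, %eax          ## ENABLE: copy %a before any prologue ...
        cmpl    %esi, %eax          ## ... compare it against %b ...
        jge     LBB0_2              ## ... and exit early: no prologue ran.
        pushq   %rax                ## Prologue, only on the path that calls.
        movl    %eax, 4(%rsp)       ## Store %a in the alloca.
        leaq    4(%rsp), %rsi       ## Second argument: the alloca address.
        xorl    %edi, %edi          ## First argument: zero.
        callq   _doSomething
        addq    $8, %rsp            ## Epilogue, right after the call.
LBB0_2:                             ## Common exit: no frame left to tear down.
        retq

With -enable-shrink-wrap=false, the DISABLE lines instead expect the pushq at
the very top of the function, before the comparison, and a matching popq in the
exit block right before retq, whether or not the calling path was taken.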