Add support for annotated disassembly output for X86 and ARM.

[oota-llvm.git] / lib / Target / ARM / Thumb2SizeReduction.cpp
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp

index 65846b242215aa2a2995ef45d1d23123e221ddfa..f18f491f4995d02224f7ffd1f9ba5009815e12ce 100644 (file)
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -9,11 +9,11 @@
  
  #define DEBUG_TYPE "t2-reduce-size"
  #include "ARM.h"
-#include "ARMAddressingModes.h"
  #include "ARMBaseRegisterInfo.h"
  #include "ARMBaseInstrInfo.h"
  #include "ARMSubtarget.h"
  #include "Thumb2InstrInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -39,9 +39,9 @@ namespace {
    /// ReduceTable - A static table with information on mapping from wide
    /// opcodes to narrow
    struct ReduceEntry {
-    unsigned WideOpc;      // Wide opcode
-    unsigned NarrowOpc1;   // Narrow opcode to transform to
-    unsigned NarrowOpc2;   // Narrow opcode when it's two-address
+    uint16_t WideOpc;      // Wide opcode
+    uint16_t NarrowOpc1;   // Narrow opcode to transform to
+    uint16_t NarrowOpc2;   // Narrow opcode when it's two-address
      uint8_t  Imm1Limit;    // Limit of immediate field (bits)
      uint8_t  Imm2Limit;    // Limit of immediate field when it's two-address
      unsigned LowRegs1 : 1; // Only possible if low-registers are used
@@ -67,6 +67,7 @@ namespace {
      { ARM::t2BICrr, 0,            ARM::tBIC,     0,   0,    0,   1,  0,0, 1,0 },
      //FIXME: Disable CMN, as CCodes are backwards from compare expectations
      //{ ARM::t2CMNrr, ARM::tCMN,  0,             0,   0,    1,   0,  2,0, 0,0 },
+    { ARM::t2CMNzrr, ARM::tCMNz,  0,             0,   0,    1,   0,  2,0, 0,0 },
      { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,    1,   0,  2,0, 0,0 },
      { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,    0,   0,  2,0, 0,1 },
      { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,    0,   1,  0,0, 1,0 },
@@ -82,9 +83,7 @@ namespace {
      { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 0,0 },
      { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 0,1 },
      // FIXME: Do we need the 16-bit 'S' variant?
-    { ARM::t2MOVr,ARM::tMOVgpr2gpr,0,            0,   0,    0,   0,  1,0, 0,0 },
-    { ARM::t2MOVCCr,0,            ARM::tMOVCCr,  0,   0,    0,   0,  0,1, 0,0 },
-    { ARM::t2MOVCCi,0,            ARM::tMOVCCi,  0,   8,    0,   1,  0,1, 0,0 },
+    { ARM::t2MOVr,ARM::tMOVr,     0,             0,   0,    0,   0,  1,0, 0,0 },
      { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,    0,   1,  0,0, 1,0 },
      { ARM::t2MVNr,  ARM::tMVN,    0,             0,   0,    1,   0,  0,0, 0,0 },
      { ARM::t2ORRrr, 0,            ARM::tORR,     0,   0,    0,   1,  0,0, 1,0 },
@@ -99,11 +98,11 @@ namespace {
      { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,    1,   0,  0,0, 0,0 },
      { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  2,2, 0,0 },
      { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,    1,   0,  2,0, 0,0 },
-    { ARM::t2SXTBr, ARM::tSXTB,   0,             0,   0,    1,   0,  1,0, 0,0 },
-    { ARM::t2SXTHr, ARM::tSXTH,   0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2SXTB,  ARM::tSXTB,   0,             0,   0,    1,   0,  1,0, 0,1 },
+    { ARM::t2SXTH,  ARM::tSXTH,   0,             0,   0,    1,   0,  1,0, 0,1 },
      { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,    1,   0,  2,0, 0,0 },
-    { ARM::t2UXTBr, ARM::tUXTB,   0,             0,   0,    1,   0,  1,0, 0,0 },
-    { ARM::t2UXTHr, ARM::tUXTH,   0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2UXTB,  ARM::tUXTB,   0,             0,   0,    1,   0,  1,0, 0,1 },
+    { ARM::t2UXTH,  ARM::tUXTH,   0,             0,   0,    1,   0,  1,0, 0,1 },
  
      // FIXME: Clean this up after splitting each Thumb load / store opcode
      // into multiple ones.
@@ -148,7 +147,8 @@ namespace {
      /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
      DenseMap<unsigned, unsigned> ReduceOpcodeMap;
  
-    bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use);
+    bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
+                             bool IsSelfLoop);
  
      bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                           bool is2Addr, ARMCC::CondCodes Pred,
@@ -159,19 +159,21 @@ namespace {
  
      bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                         const ReduceEntry &Entry, bool LiveCPSR,
-                       MachineInstr *CPSRDef);
+                       MachineInstr *CPSRDef, bool IsSelfLoop);
  
      /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
      /// instruction.
      bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                         const ReduceEntry &Entry,
-                       bool LiveCPSR, MachineInstr *CPSRDef);
+                       bool LiveCPSR, MachineInstr *CPSRDef,
+                       bool IsSelfLoop);
  
      /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
      /// non-two-address instruction.
      bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                          const ReduceEntry &Entry,
-                        bool LiveCPSR, MachineInstr *CPSRDef);
+                        bool LiveCPSR, MachineInstr *CPSRDef,
+                        bool IsSelfLoop);
  
      /// ReduceMBB - Reduce width of instructions in the specified basic block.
      bool ReduceMBB(MachineBasicBlock &MBB);
@@ -188,7 +190,7 @@ Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) {
  }
  
  static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
-  for (const unsigned *Regs = MCID.ImplicitDefs; *Regs; ++Regs)
+  for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs)
      if (*Regs == ARM::CPSR)
        return true;
    return false;
@@ -212,10 +214,17 @@ static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
  /// In this case it would have been ok to narrow the mul.w to muls since there
  /// are indirect RAW dependency between the muls and the mul.w
  bool
-Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) {
-  if (!Def || !STI->avoidCPSRPartialUpdate())
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
+                                      bool FirstInSelfLoop) {
+  // FIXME: Disable check for -Oz (aka OptimizeForSizeHarder).
+  if (!STI->avoidCPSRPartialUpdate())
      return false;
  
+  if (!Def)
+    // If this BB loops back to itself, conservatively avoid narrowing the
+    // first instruction that does partial flag update.
+    return FirstInSelfLoop;
+
    SmallSet<unsigned, 2> Defs;
    for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
      const MachineOperand &MO = Def->getOperand(i);
@@ -444,7 +453,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
  
    // Add the 16-bit load / store instruction.
    DebugLoc dl = MI->getDebugLoc();
-  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc));
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
    if (!isLdStMul) {
      MIB.addOperand(MI->getOperand(0));
      MIB.addOperand(MI->getOperand(1));
@@ -470,7 +479,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
  
    DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
  
-  MBB.erase(MI);
+  MBB.erase_instr(MI);
    ++NumLdSts;
    return true;
  }
@@ -478,41 +487,46 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
  bool
  Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                                  const ReduceEntry &Entry,
-                                bool LiveCPSR, MachineInstr *CPSRDef) {
+                                bool LiveCPSR, MachineInstr *CPSRDef,
+                                bool IsSelfLoop) {
    unsigned Opc = MI->getOpcode();
    if (Opc == ARM::t2ADDri) {
      // If the source register is SP, try to reduce to tADDrSPi, otherwise
      // it's a normal reduce.
      if (MI->getOperand(1).getReg() != ARM::SP) {
-      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef))
+      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
          return true;
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
      }
      // Try to reduce to tADDrSPi.
      unsigned Imm = MI->getOperand(2).getImm();
      // The immediate must be in range, the destination register must be a low
-    // reg, and the condition flags must not be being set.
-    if (Imm & 3 || Imm > 1024)
+    // reg, the predicate must be "always" and the condition flags must not
+    // be being set.
+    if (Imm & 3 || Imm > 1020)
        return false;
      if (!isARMLowRegister(MI->getOperand(0).getReg()))
        return false;
+    if (MI->getOperand(3).getImm() != ARMCC::AL)
+      return false;
      const MCInstrDesc &MCID = MI->getDesc();
      if (MCID.hasOptionalDef() &&
          MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR)
        return false;
  
-    MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(),
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
                                        TII->get(ARM::tADDrSPi))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
+    AddDefaultPred(MIB);
  
      // Transfer MI flags.
      MIB.setMIFlags(MI->getFlags());
  
      DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " <<*MIB);
  
-    MBB.erase(MI);
+    MBB.erase_instr(MI);
      ++NumNarrows;
      return true;
    }
@@ -520,8 +534,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
    if (Entry.LowRegs1 && !VerifyLowRegs(MI))
      return false;
  
-  const MCInstrDesc &MCID = MI->getDesc();
-  if (MCID.mayLoad() || MCID.mayStore())
+  if (MI->mayLoad() || MI->mayStore())
      return ReduceLoadStore(MBB, MI, Entry);
  
    switch (Opc) {
@@ -533,26 +546,30 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
        switch (Opc) {
        default: break;
        case ARM::t2ADDSri: {
-        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef))
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
            return true;
          // fallthrough
        }
        case ARM::t2ADDSrr:
-        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
        }
      }
      break;
    }
    case ARM::t2RSBri:
    case ARM::t2RSBSri:
+  case ARM::t2SXTB:
+  case ARM::t2SXTH:
+  case ARM::t2UXTB:
+  case ARM::t2UXTH:
      if (MI->getOperand(2).getImm() == 0)
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
      break;
    case ARM::t2MOVi16:
      // Can convert only 'pure' immediate operands, not immediates obtained as
      // globals' addresses.
      if (MI->getOperand(1).isImm())
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
      break;
    case ARM::t2CMPrr: {
      // Try to reduce to the lo-reg only version first. Why there are two
@@ -562,9 +579,9 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
      // source insn opcode. So for now, we hack a local entry record to use.
      static const ReduceEntry NarrowEntry =
        { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 };
-    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef))
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef, IsSelfLoop))
        return true;
-    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
    }
    }
    return false;
@@ -573,14 +590,32 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
  bool
  Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                                  const ReduceEntry &Entry,
-                                bool LiveCPSR, MachineInstr *CPSRDef) {
+                                bool LiveCPSR, MachineInstr *CPSRDef,
+                                bool IsSelfLoop) {
  
    if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
      return false;
  
    unsigned Reg0 = MI->getOperand(0).getReg();
    unsigned Reg1 = MI->getOperand(1).getReg();
-  if (Reg0 != Reg1) {
+  // t2MUL is "special". The tied source operand is second, not first.
+  if (MI->getOpcode() == ARM::t2MUL) {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    // Early exit if the regs aren't all low regs.
+    if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
+        || !isARMLowRegister(Reg2))
+      return false;
+    if (Reg0 != Reg2) {
+      // If the other operand also isn't the same as the destination, we
+      // can't reduce.
+      if (Reg1 != Reg0)
+        return false;
+      // Try to commute the operands to make it a 2-address instruction.
+      MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+      if (!CommutedMI)
+        return false;
+    }
+  } else if (Reg0 != Reg1) {
      // Try to commute the operands to make it a 2-address instruction.
      unsigned CommOpIdx1, CommOpIdx2;
      if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
@@ -631,12 +666,12 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
    // Avoid adding a false dependency on partial flag update by some 16-bit
    // instructions which has the 's' bit set.
    if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
-      canAddPseudoFlagDep(CPSRDef, MI))
+      canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
      return false;
  
    // Add the 16-bit instruction.
    DebugLoc dl = MI->getDebugLoc();
-  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewMCID);
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
    MIB.addOperand(MI->getOperand(0));
    if (NewMCID.hasOptionalDef()) {
      if (HasCC)
@@ -660,7 +695,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
  
    DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
  
-  MBB.erase(MI);
+  MBB.erase_instr(MI);
    ++Num2Addrs;
    return true;
  }
@@ -668,7 +703,8 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
  bool
  Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                                   const ReduceEntry &Entry,
-                                 bool LiveCPSR, MachineInstr *CPSRDef) {
+                                 bool LiveCPSR, MachineInstr *CPSRDef,
+                                 bool IsSelfLoop) {
    if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
      return false;
  
@@ -721,12 +757,12 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
    // Avoid adding a false dependency on partial flag update by some 16-bit
    // instructions which has the 's' bit set.
    if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
-      canAddPseudoFlagDep(CPSRDef, MI))
+      canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
      return false;
  
    // Add the 16-bit instruction.
    DebugLoc dl = MI->getDebugLoc();
-  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewMCID);
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
    MIB.addOperand(MI->getOperand(0));
    if (NewMCID.hasOptionalDef()) {
      if (HasCC)
@@ -741,7 +777,11 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
      if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
        continue;
      if ((MCID.getOpcode() == ARM::t2RSBSri ||
-         MCID.getOpcode() == ARM::t2RSBri) && i == 2)
+         MCID.getOpcode() == ARM::t2RSBri ||
+         MCID.getOpcode() == ARM::t2SXTB ||
+         MCID.getOpcode() == ARM::t2SXTH ||
+         MCID.getOpcode() == ARM::t2UXTB ||
+         MCID.getOpcode() == ARM::t2UXTH) && i == 2)
        // Skip the zero immediate operand, it's now implicit.
        continue;
      bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
@@ -762,7 +802,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
  
    DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
  
-  MBB.erase(MI);
+  MBB.erase_instr(MI);
    ++NumNarrows;
    return true;
  }
@@ -807,13 +847,22 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
    // Yes, CPSR could be livein.
    bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
    MachineInstr *CPSRDef = 0;
+  MachineInstr *BundleMI = 0;
  
-  MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
-  MachineBasicBlock::iterator NextMII;
+  // If this BB loops back to itself, conservatively avoid narrowing the
+  // first instruction that does partial flag update.
+  bool IsSelfLoop = MBB.isSuccessor(&MBB);
+  MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end();
+  MachineBasicBlock::instr_iterator NextMII;
    for (; MII != E; MII = NextMII) {
      NextMII = llvm::next(MII);
  
      MachineInstr *MI = &*MII;
+    if (MI->isBundle()) {
+      BundleMI = MI;
+      continue;
+    }
+
      LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
  
      unsigned Opcode = MI->getOpcode();
@@ -822,9 +871,9 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
        const ReduceEntry &Entry = ReduceTable[OPI->second];
        // Ignore "special" cases for now.
        if (Entry.Special) {
-        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
+        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) {
            Modified = true;
-          MachineBasicBlock::iterator I = prior(NextMII);
+          MachineBasicBlock::instr_iterator I = prior(NextMII);
            MI = &*I;
          }
          goto ProcessNext;
@@ -832,31 +881,46 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
  
        // Try to transform to a 16-bit two-address instruction.
        if (Entry.NarrowOpc2 &&
-          ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
+          ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) {
          Modified = true;
-        MachineBasicBlock::iterator I = prior(NextMII);
+        MachineBasicBlock::instr_iterator I = prior(NextMII);
          MI = &*I;
          goto ProcessNext;
        }
  
        // Try to transform to a 16-bit non-two-address instruction.
        if (Entry.NarrowOpc1 &&
-          ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
+          ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) {
          Modified = true;
-        MachineBasicBlock::iterator I = prior(NextMII);
+        MachineBasicBlock::instr_iterator I = prior(NextMII);
          MI = &*I;
        }
      }
  
    ProcessNext:
+    if (NextMII != E && MI->isInsideBundle() && !NextMII->isInsideBundle()) {
+      // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
+      // marker is only on the BUNDLE instruction. Process the BUNDLE
+      // instruction as we finish with the bundled instruction to work around
+      // the inconsistency.
+      if (BundleMI->killsRegister(ARM::CPSR))
+        LiveCPSR = false;
+      MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
+      if (MO && !MO->isDead())
+        LiveCPSR = true;
+    }
+
      bool DefCPSR = false;
      LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
-    if (MI->getDesc().isCall())
+    if (MI->isCall()) {
        // Calls don't really set CPSR.
        CPSRDef = 0;
-    else if (DefCPSR)
+      IsSelfLoop = false;
+    } else if (DefCPSR) {
        // This is the last CPSR defining instruction.
        CPSRDef = MI;
+      IsSelfLoop = false;
+    }
    }
  
    return Modified;