Add load-folding table entries for MOVDQA.

[oota-llvm.git] / lib / Target / X86 / X86InstrInfo.cpp
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index e105b0f3cd8671c685e40bbead1eb15e4e55624a..9cf67b9d4ffe317a1df9a06d478bbe7ab6f049db 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -18,7 +18,9 @@
  #include "X86MachineFunctionInfo.h"
  #include "X86Subtarget.h"
  #include "X86TargetMachine.h"
+#include "llvm/DerivedTypes.h"
  #include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -263,6 +265,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::MOV8rr,      X86::MOV8mr, 0 },
      { X86::MOVAPDrr,    X86::MOVAPDmr, 0 },
      { X86::MOVAPSrr,    X86::MOVAPSmr, 0 },
+    { X86::MOVDQArr,    X86::MOVDQAmr, 0 },
      { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0 },
      { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0 },
      { X86::MOVPS2SSrr,  X86::MOVPS2SSmr, 0 },
@@ -286,8 +289,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::SETLEr,      X86::SETLEm, 0 },
      { X86::SETLr,       X86::SETLm, 0 },
      { X86::SETNEr,      X86::SETNEm, 0 },
+    { X86::SETNOr,      X86::SETNOm, 0 },
      { X86::SETNPr,      X86::SETNPm, 0 },
      { X86::SETNSr,      X86::SETNSm, 0 },
+    { X86::SETOr,       X86::SETOm, 0 },
      { X86::SETPr,       X86::SETPm, 0 },
      { X86::SETSr,       X86::SETSm, 0 },
      { X86::TAILJMPr,    X86::TAILJMPm, 1 },
@@ -376,6 +381,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::MOVDDUPrr,       X86::MOVDDUPrm },
      { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm },
      { X86::MOVDI2SSrr,      X86::MOVDI2SSrm },
+    { X86::MOVDQArr,        X86::MOVDQArm },
      { X86::MOVSD2PDrr,      X86::MOVSD2PDrm },
      { X86::MOVSDrr,         X86::MOVSDrm },
      { X86::MOVSHDUPrr,      X86::MOVSHDUPrm },
@@ -489,12 +495,18 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::CMOVNE16rr,      X86::CMOVNE16rm },
      { X86::CMOVNE32rr,      X86::CMOVNE32rm },
      { X86::CMOVNE64rr,      X86::CMOVNE64rm },
+    { X86::CMOVNO16rr,      X86::CMOVNO16rm },
+    { X86::CMOVNO32rr,      X86::CMOVNO32rm },
+    { X86::CMOVNO64rr,      X86::CMOVNO64rm },
      { X86::CMOVNP16rr,      X86::CMOVNP16rm },
      { X86::CMOVNP32rr,      X86::CMOVNP32rm },
      { X86::CMOVNP64rr,      X86::CMOVNP64rm },
      { X86::CMOVNS16rr,      X86::CMOVNS16rm },
      { X86::CMOVNS32rr,      X86::CMOVNS32rm },
      { X86::CMOVNS64rr,      X86::CMOVNS64rm },
+    { X86::CMOVO16rr,       X86::CMOVO16rm },
+    { X86::CMOVO32rr,       X86::CMOVO32rm },
+    { X86::CMOVO64rr,       X86::CMOVO64rm },
      { X86::CMOVP16rr,       X86::CMOVP16rm },
      { X86::CMOVP32rr,       X86::CMOVP32rm },
      { X86::CMOVP64rr,       X86::CMOVP64rm },
@@ -576,7 +588,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::PMINSWrr,        X86::PMINSWrm },
      { X86::PMINUBrr,        X86::PMINUBrm },
      { X86::PMULDQrr,        X86::PMULDQrm },
-    { X86::PMULDQrr_int,    X86::PMULDQrm_int },
      { X86::PMULHUWrr,       X86::PMULHUWrm },
      { X86::PMULHWrr,        X86::PMULHWrm },
      { X86::PMULLDrr,        X86::PMULLDrm },
@@ -672,6 +683,7 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
    case X86::FsMOVAPDrr:
    case X86::MOVAPSrr:
    case X86::MOVAPDrr:
+  case X86::MOVDQArr:
    case X86::MOVSS2PSrr:
    case X86::MOVSD2PDrr:
    case X86::MOVPS2SSrr:
@@ -688,7 +700,7 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
    }
  }
  
-unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI, 
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, 
                                             int &FrameIndex) const {
    switch (MI->getOpcode()) {
    default: break;
@@ -703,6 +715,7 @@ unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI,
    case X86::MOVSDrm:
    case X86::MOVAPSrm:
    case X86::MOVAPDrm:
+  case X86::MOVDQArm:
    case X86::MMX_MOVD64rm:
    case X86::MMX_MOVQ64rm:
      if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
@@ -718,7 +731,7 @@ unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI,
    return 0;
  }
  
-unsigned X86InstrInfo::isStoreToStackSlot(MachineInstr *MI,
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
                                            int &FrameIndex) const {
    switch (MI->getOpcode()) {
    default: break;
@@ -733,6 +746,7 @@ unsigned X86InstrInfo::isStoreToStackSlot(MachineInstr *MI,
    case X86::MOVSDmr:
    case X86::MOVAPSmr:
    case X86::MOVAPDmr:
+  case X86::MOVDQAmr:
    case X86::MMX_MOVD64mr:
    case X86::MMX_MOVQ64mr:
    case X86::MMX_MOVNTQmr:
@@ -786,6 +800,7 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
      case X86::MOVSDrm:
      case X86::MOVAPSrm:
      case X86::MOVAPDrm:
+    case X86::MOVDQArm:
      case X86::MMX_MOVD64rm:
      case X86::MMX_MOVQ64rm: {
        // Loads from constant pools are trivially rematerializable.
@@ -848,12 +863,13 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
  /// two instructions it assumes it's not safe.
  static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I) {
+  // It's always safe to clobber EFLAGS at the end of a block.
+  if (I == MBB.end())
+    return true;
+
    // For compile time consideration, if we are not able to determine the
    // safety after visiting 2 instructions, we will assume it's not safe.
    for (unsigned i = 0; i < 2; ++i) {
-    if (I == MBB.end())
-      // Reached end of block, it's safe.
-      return true;
      bool SeenDef = false;
      for (unsigned j = 0, e = I->getNumOperands(); j != e; ++j) {
        MachineOperand &MO = I->getOperand(j);
@@ -870,6 +886,10 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
        // This instruction defines EFLAGS, no need to look any further.
        return true;
      ++I;
+
+    // If we make it to the end of the block, it's safe to clobber EFLAGS.
+    if (I == MBB.end())
+      return true;
    }
  
    // Conservative answer.
@@ -931,7 +951,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
  /// from the argument area of a function if it does not change.  This should
  /// only return true of *all* loads the instruction does are invariant (if it
  /// does multiple loads).
-bool X86InstrInfo::isInvariantLoad(MachineInstr *MI) const {
+bool X86InstrInfo::isInvariantLoad(const MachineInstr *MI) const {
    // This code cares about loads from three cases: constant pool entries,
    // invariant argument slots, and global stubs.  In order to handle these cases
    // for all of the myriad of X86 instructions, we just scan for a CP/FI/GV
@@ -1098,7 +1118,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
      switch (MIOpc) {
      default: return 0;
      case X86::INC64r:
-    case X86::INC32r: {
+    case X86::INC32r:
+    case X86::INC64_32r: {
        assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
        unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
          : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
@@ -1116,7 +1137,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                             Src, isKill, 1);
        break;
      case X86::DEC64r:
-    case X86::DEC32r: {
+    case X86::DEC32r:
+    case X86::DEC64_32r: {
        assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
        unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
          : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
@@ -1248,26 +1270,14 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
      case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
      }
      unsigned Amt = MI->getOperand(3).getImm();
-    unsigned A = MI->getOperand(0).getReg();
-    unsigned B = MI->getOperand(1).getReg();
-    unsigned C = MI->getOperand(2).getReg();
-    bool AisDead = MI->getOperand(0).isDead();
-    bool BisKill = MI->getOperand(1).isKill();
-    bool CisKill = MI->getOperand(2).isKill();
-    // If machine instrs are no longer in two-address forms, update
-    // destination register as well.
-    if (A == B) {
-      // Must be two address instruction!
-      assert(MI->getDesc().getOperandConstraint(0, TOI::TIED_TO) &&
-             "Expecting a two-address instruction!");
-      A = C;
-      CisKill = false;
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
      }
-    MachineFunction &MF = *MI->getParent()->getParent();
-    return BuildMI(MF, get(Opc))
-      .addReg(A, true, false, false, AisDead)
-      .addReg(C, false, false, CisKill)
-      .addReg(B, false, false, BisKill).addImm(Size-Amt);
+    MI->setDesc(get(Opc));
+    MI->getOperand(3).setImm(Size-Amt);
+    return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
    }
    case X86::CMOVB16rr:
    case X86::CMOVB32rr:
@@ -1310,7 +1320,13 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
    case X86::CMOVP64rr:
    case X86::CMOVNP16rr:
    case X86::CMOVNP32rr:
-  case X86::CMOVNP64rr: {
+  case X86::CMOVNP64rr:
+  case X86::CMOVO16rr:
+  case X86::CMOVO32rr:
+  case X86::CMOVO64rr:
+  case X86::CMOVNO16rr:
+  case X86::CMOVNO32rr:
+  case X86::CMOVNO64rr: {
      unsigned Opc = 0;
      switch (MI->getOpcode()) {
      default: break;
@@ -1356,8 +1372,18 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
      case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
      case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
      case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+    case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
+    case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
+    case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
+    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+    }
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
      }
-
      MI->setDesc(get(Opc));
      // Fallthrough intended.
    }
@@ -1458,88 +1484,101 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *&TBB,
                                   MachineBasicBlock *&FBB,
                                   SmallVectorImpl<MachineOperand> &Cond) const {
-  // If the block has no terminators, it just falls into the block after it.
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
    MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin() || !isBrAnalysisUnpredicatedTerminator(--I, *this))
-    return false;
-
-  // Get the last instruction in the block.
-  MachineInstr *LastInst = I;
-  
-  // If there is only one terminator instruction, process it.
-  if (I == MBB.begin() || !isBrAnalysisUnpredicatedTerminator(--I, *this)) {
-    if (!LastInst->getDesc().isBranch())
+  while (I != MBB.begin()) {
+    --I;
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isBrAnalysisUnpredicatedTerminator(I, *this))
+      break;
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->getDesc().isBranch())
        return true;
-    
-    // If the block ends with a branch there are 3 possibilities:
-    // it's an unconditional, conditional, or indirect branch.
-    
-    if (LastInst->getOpcode() == X86::JMP) {
-      TBB = LastInst->getOperand(0).getMBB();
-      return false;
+    // Handle unconditional branches.
+    if (I->getOpcode() == X86::JMP) {
+      // If the block has any instructions after a JMP, delete them.
+      while (next(I) != MBB.end())
+        next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = 0;
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+      // TBB is used to indicate the unconditional destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
      }
-    X86::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    // Handle conditional branches.
+    X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
      if (BranchCode == X86::COND_INVALID)
        return true;  // Can't handle indirect branch.
-
-    // Otherwise, block ends with fall-through condbranch.
-    TBB = LastInst->getOperand(0).getMBB();
-    Cond.push_back(MachineOperand::CreateImm(BranchCode));
-    return false;
-  }
-  
-  // Get the instruction before it if it's a terminator.
-  MachineInstr *SecondLastInst = I;
-  
-  // If there are three terminators, we don't know what sort of block this is.
-  if (SecondLastInst && I != MBB.begin() &&
-      isBrAnalysisUnpredicatedTerminator(--I, *this))
-    return true;
-
-  // If the block ends with X86::JMP and a conditional branch, handle it.
-  X86::CondCode BranchCode = GetCondFromBranchOpc(SecondLastInst->getOpcode());
-  if (BranchCode != X86::COND_INVALID && LastInst->getOpcode() == X86::JMP) {
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    Cond.push_back(MachineOperand::CreateImm(BranchCode));
-    FBB = LastInst->getOperand(0).getMBB();
-    return false;
-  }
-
-  // If the block ends with two X86::JMPs, handle it.  The second one is not
-  // executed, so remove it.
-  if (SecondLastInst->getOpcode() == X86::JMP && 
-      LastInst->getOpcode() == X86::JMP) {
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    I = LastInst;
-    I->eraseFromParent();
-    return false;
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+    // Handle subsequent conditional branches. Only handle the case
+    // where all conditional branches branch to the same destination
+    // and their condition opcodes fit one of the special
+    // multi-branch idioms.
+    assert(Cond.size() == 1);
+    assert(TBB);
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode)
+      continue;
+    // If they differ, see if they fit one of the known patterns.
+    // Theoretically we could handle more patterns here, but
+    // we shouldn't expect to see them if instruction selection
+    // has done a reasonable job.
+    if ((OldBranchCode == X86::COND_NP &&
+         BranchCode == X86::COND_E) ||
+        (OldBranchCode == X86::COND_E &&
+         BranchCode == X86::COND_NP))
+      BranchCode = X86::COND_NP_OR_E;
+    else if ((OldBranchCode == X86::COND_P &&
+              BranchCode == X86::COND_NE) ||
+             (OldBranchCode == X86::COND_NE &&
+              BranchCode == X86::COND_P))
+      BranchCode = X86::COND_NE_OR_P;
+    else
+      return true;
+    // Update the MachineOperand.
+    Cond[0].setImm(BranchCode);
    }
  
-  // Otherwise, can't handle this.
-  return true;
+  return false;
  }
  
  unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
    MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin()) return 0;
-  --I;
-  if (I->getOpcode() != X86::JMP && 
-      GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
-    return 0;
-  
-  // Remove the branch.
-  I->eraseFromParent();
-  
-  I = MBB.end();
-  
-  if (I == MBB.begin()) return 1;
-  --I;
-  if (GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
-    return 1;
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->getOpcode() != X86::JMP &&
+        GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
    
-  // Remove the branch.
-  I->eraseFromParent();
-  return 2;
+  return Count;
  }
  
  static const MachineInstrBuilder &X86InstrAddOperand(MachineInstrBuilder &MIB,
@@ -1574,23 +1613,43 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
    assert((Cond.size() == 1 || Cond.size() == 0) &&
           "X86 branch conditions have one component!");
  
-  if (FBB == 0) { // One way branch.
-    if (Cond.empty()) {
-      // Unconditional branch?
-      BuildMI(&MBB, get(X86::JMP)).addMBB(TBB);
-    } else {
-      // Conditional branch.
-      unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
-      BuildMI(&MBB, get(Opc)).addMBB(TBB);
-    }
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, get(X86::JMP)).addMBB(TBB);
      return 1;
    }
-  
-  // Two-way Conditional branch.
-  unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
-  BuildMI(&MBB, get(Opc)).addMBB(TBB);
-  BuildMI(&MBB, get(X86::JMP)).addMBB(FBB);
-  return 2;
+
+  // Conditional branch.
+  unsigned Count = 0;
+  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+  switch (CC) {
+  case X86::COND_NP_OR_E:
+    // Synthesize NP_OR_E with two branches.
+    BuildMI(&MBB, get(X86::JNP)).addMBB(TBB);
+    ++Count;
+    BuildMI(&MBB, get(X86::JE)).addMBB(TBB);
+    ++Count;
+    break;
+  case X86::COND_NE_OR_P:
+    // Synthesize NE_OR_P with two branches.
+    BuildMI(&MBB, get(X86::JNE)).addMBB(TBB);
+    ++Count;
+    BuildMI(&MBB, get(X86::JP)).addMBB(TBB);
+    ++Count;
+    break;
+  default: {
+    unsigned Opc = GetCondBranchFromCond(CC);
+    BuildMI(&MBB, get(Opc)).addMBB(TBB);
+    ++Count;
+  }
+  }
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, get(X86::JMP)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
  }
  
  bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
@@ -1850,7 +1909,8 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
      unsigned Reg = CSI[i-1].getReg();
      // Add the callee-saved register as live-in. It's killed at the spill.
      MBB.addLiveIn(Reg);
-    BuildMI(MBB, MI, get(Opc)).addReg(Reg);
+    BuildMI(MBB, MI, get(Opc))
+      .addReg(Reg, /*isDef=*/false, /*isImp=*/false, /*isKill=*/true);
    }
    return true;
  }
@@ -1872,7 +1932,7 @@ bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
  }
  
  static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
-                                     const SmallVector<MachineOperand,4> &MOs,
+                                     const SmallVectorImpl<MachineOperand> &MOs,
                                   MachineInstr *MI, const TargetInstrInfo &TII) {
    // Create the base instruction with the memory operand as the first part.
    MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), true);
@@ -1898,7 +1958,7 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
  
  static MachineInstr *FuseInst(MachineFunction &MF,
                                unsigned Opcode, unsigned OpNo,
-                              const SmallVector<MachineOperand,4> &MOs,
+                              const SmallVectorImpl<MachineOperand> &MOs,
                                MachineInstr *MI, const TargetInstrInfo &TII) {
    MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), true);
    MachineInstrBuilder MIB(NewMI);
@@ -1920,7 +1980,7 @@ static MachineInstr *FuseInst(MachineFunction &MF,
  }
  
  static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
-                                const SmallVector<MachineOperand,4> &MOs,
+                                const SmallVectorImpl<MachineOperand> &MOs,
                                  MachineInstr *MI) {
    MachineFunction &MF = *MI->getParent()->getParent();
    MachineInstrBuilder MIB = BuildMI(MF, TII.get(Opcode));
@@ -1934,9 +1994,9 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
  }
  
  MachineInstr*
-X86InstrInfo::foldMemoryOperand(MachineFunction &MF,
-                                MachineInstr *MI, unsigned i,
-                                const SmallVector<MachineOperand,4> &MOs) const{
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                    MachineInstr *MI, unsigned i,
+                                    const SmallVectorImpl<MachineOperand> &MOs) const{
    const DenseMap<unsigned*, unsigned> *OpcodeTablePtr = NULL;
    bool isTwoAddrFold = false;
    unsigned NumOps = MI->getDesc().getNumOperands();
@@ -1988,15 +2048,15 @@ X86InstrInfo::foldMemoryOperand(MachineFunction &MF,
    
    // No fusion 
    if (PrintFailedFusing)
-    cerr << "We failed to fuse operand " << i << *MI;
+    cerr << "We failed to fuse operand " << i << " in " << *MI;
    return NULL;
  }
  
  
-MachineInstr* X86InstrInfo::foldMemoryOperand(MachineFunction &MF,
-                                              MachineInstr *MI,
-                                        const SmallVectorImpl<unsigned> &Ops,
-                                              int FrameIndex) const {
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                                  const SmallVectorImpl<unsigned> &Ops,
+                                                  int FrameIndex) const {
    // Check switch flag 
    if (NoFusing) return NULL;
  
@@ -2037,13 +2097,13 @@ MachineInstr* X86InstrInfo::foldMemoryOperand(MachineFunction &MF,
  
    SmallVector<MachineOperand,4> MOs;
    MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperand(MF, MI, Ops[0], MOs);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
  }
  
-MachineInstr* X86InstrInfo::foldMemoryOperand(MachineFunction &MF,
-                                              MachineInstr *MI,
-                                        const SmallVectorImpl<unsigned> &Ops,
-                                              MachineInstr *LoadMI) const {
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                            const SmallVectorImpl<unsigned> &Ops,
+                                                  MachineInstr *LoadMI) const {
    // Check switch flag 
    if (NoFusing) return NULL;
  
@@ -2086,10 +2146,41 @@ MachineInstr* X86InstrInfo::foldMemoryOperand(MachineFunction &MF,
      return NULL;
  
    SmallVector<MachineOperand,4> MOs;
-  unsigned NumOps = LoadMI->getDesc().getNumOperands();
-  for (unsigned i = NumOps - 4; i != NumOps; ++i)
-    MOs.push_back(LoadMI->getOperand(i));
-  return foldMemoryOperand(MF, MI, Ops[0], MOs);
+  if (LoadMI->getOpcode() == X86::V_SET0 ||
+      LoadMI->getOpcode() == X86::V_SETALLONES) {
+    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+    // Create a constant-pool entry and operands to load from it.
+
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (TM.getRelocationModel() == Reloc::PIC_ &&
+        !TM.getSubtarget<X86Subtarget>().is64Bit())
+      // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
+      // This doesn't work for several reasons.
+      // 1. GlobalBaseReg may have been spilled.
+      // 2. It may not be live at MI.
+      return NULL;
+
+    // Create a v4i32 constant-pool entry.
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    const VectorType *Ty = VectorType::get(Type::Int32Ty, 4);
+    Constant *C = LoadMI->getOpcode() == X86::V_SET0 ?
+                    ConstantVector::getNullValue(Ty) :
+                    ConstantVector::getAllOnesValue(Ty);
+    unsigned CPI = MCP.getConstantPoolIndex(C, /*AlignmentLog2=*/4);
+
+    // Create operands to load from the constant pool entry.
+    MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+    MOs.push_back(MachineOperand::CreateImm(1));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+  } else {
+    // Folding a normal load. Just copy the load's address operands.
+    unsigned NumOps = LoadMI->getDesc().getNumOperands();
+    for (unsigned i = NumOps - 4; i != NumOps; ++i)
+      MOs.push_back(LoadMI->getOperand(i));
+  }
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
  }
  
  
@@ -2375,10 +2466,20 @@ bool X86InstrInfo::
  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
    assert(Cond.size() == 1 && "Invalid X86 branch condition!");
    X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+  if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+    return true;
    Cond[0].setImm(GetOppositeBranchCondition(CC));
    return false;
  }
  
+bool X86InstrInfo::
+IgnoreRegisterClassBarriers(const TargetRegisterClass *RC) const {
+  // FIXME: Ignore barriers of x87 stack registers for now. We can't
+  // allow any loads of these registers before FpGet_ST0_80.
+  return RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+    RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass;
+}
+
  const TargetRegisterClass *X86InstrInfo::getPointerRegClass() const {
    const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
    if (Subtarget->is64Bit())
@@ -2766,6 +2867,11 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
        FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
        break;
      }
+    case X86::TLS_tp:
+    case X86::TLS_gs_ri:
+      FinalSize += 2;
+      FinalSize += sizeGlobalAddress(false);
+      break;
      }
      CurOp = NumOps;
      break;