Change LEA to have 5 operands for its memory operand, just

[oota-llvm.git] / lib / Target / X86 / X86InstrInfo.cpp
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 223ad11c6504abd282d7c74d65b2fadfb8ad317a..9f7cddcb23c44d0fc7936975fc65f2c841856095 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -27,7 +27,9 @@
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/LiveVariables.h"
  #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
  #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetOptions.h"
@@ -265,6 +267,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::MOV16rr,     X86::MOV16mr, 0, 0 },
      { X86::MOV32ri,     X86::MOV32mi, 0, 0 },
      { X86::MOV32rr,     X86::MOV32mr, 0, 0 },
+    { X86::MOV32rr_TC,  X86::MOV32mr_TC, 0, 0 },
      { X86::MOV64ri32,   X86::MOV64mi32, 0, 0 },
      { X86::MOV64rr,     X86::MOV64mr, 0, 0 },
      { X86::MOV8ri,      X86::MOV8mi, 0, 0 },
@@ -275,11 +278,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::MOVDQArr,    X86::MOVDQAmr, 0, 16 },
      { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
      { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
-    { X86::MOVPS2SSrr,  X86::MOVPS2SSmr, 0, 0 },
-    { X86::MOVSDrr,     X86::MOVSDmr, 0, 0 },
      { X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
      { X86::MOVSS2DIrr,  X86::MOVSS2DImr, 0, 0 },
-    { X86::MOVSSrr,     X86::MOVSSmr, 0, 0 },
      { X86::MOVUPDrr,    X86::MOVUPDmr, 0, 0 },
      { X86::MOVUPSrr,    X86::MOVUPSmr, 0, 0 },
      { X86::MUL16r,      X86::MUL16m, 1, 0 },
@@ -303,6 +303,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::SETPr,       X86::SETPm, 0, 0 },
      { X86::SETSr,       X86::SETSm, 0, 0 },
      { X86::TAILJMPr,    X86::TAILJMPm, 1, 0 },
+    { X86::TAILJMPr64,  X86::TAILJMPm64, 1, 0 },
      { X86::TEST16ri,    X86::TEST16mi, 1, 0 },
      { X86::TEST32ri,    X86::TEST32mi, 1, 0 },
      { X86::TEST64ri32,  X86::TEST64mi32, 1, 0 },
@@ -378,6 +379,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm, 0 },
      { X86::MOV16rr,         X86::MOV16rm, 0 },
      { X86::MOV32rr,         X86::MOV32rm, 0 },
+    { X86::MOV32rr_TC,      X86::MOV32rm_TC, 0 },
      { X86::MOV64rr,         X86::MOV64rm, 0 },
      { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm, 0 },
      { X86::MOV64toSDrr,     X86::MOV64toSDrm, 0 },
@@ -388,12 +390,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm, 0 },
      { X86::MOVDI2SSrr,      X86::MOVDI2SSrm, 0 },
      { X86::MOVDQArr,        X86::MOVDQArm, 16 },
-    { X86::MOVSD2PDrr,      X86::MOVSD2PDrm, 0 },
-    { X86::MOVSDrr,         X86::MOVSDrm, 0 },
      { X86::MOVSHDUPrr,      X86::MOVSHDUPrm, 16 },
      { X86::MOVSLDUPrr,      X86::MOVSLDUPrm, 16 },
-    { X86::MOVSS2PSrr,      X86::MOVSS2PSrm, 0 },
-    { X86::MOVSSrr,         X86::MOVSSrm, 0 },
      { X86::MOVSX16rr8,      X86::MOVSX16rm8, 0 },
      { X86::MOVSX32rr16,     X86::MOVSX32rm16, 0 },
      { X86::MOVSX32rr8,      X86::MOVSX32rm8, 0 },
@@ -401,7 +399,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::MOVSX64rr32,     X86::MOVSX64rm32, 0 },
      { X86::MOVSX64rr8,      X86::MOVSX64rm8, 0 },
      { X86::MOVUPDrr,        X86::MOVUPDrm, 16 },
-    { X86::MOVUPSrr,        X86::MOVUPSrm, 16 },
+    { X86::MOVUPSrr,        X86::MOVUPSrm, 0 },
      { X86::MOVZDI2PDIrr,    X86::MOVZDI2PDIrm, 0 },
      { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm, 0 },
      { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
@@ -600,7 +598,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
      { X86::PMULHUWrr,       X86::PMULHUWrm, 16 },
      { X86::PMULHWrr,        X86::PMULHWrm, 16 },
      { X86::PMULLDrr,        X86::PMULLDrm, 16 },
-    { X86::PMULLDrr_int,    X86::PMULLDrm_int, 16 },
      { X86::PMULLWrr,        X86::PMULLWrm, 16 },
      { X86::PMULUDQrr,       X86::PMULUDQrm, 16 },
      { X86::PORrr,           X86::PORrm, 16 },
@@ -681,23 +678,22 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
    case X86::MOV16rr:
    case X86::MOV32rr: 
    case X86::MOV64rr:
-  case X86::MOVSSrr:
-  case X86::MOVSDrr:
+  case X86::MOV32rr_TC: 
+  case X86::MOV64rr_TC:
  
    // FP Stack register class copies
    case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080:
    case X86::MOV_Fp3264: case X86::MOV_Fp3280:
    case X86::MOV_Fp6432: case X86::MOV_Fp8032:
-      
+
+  // Note that MOVSSrr and MOVSDrr are not considered copies. FR32 and FR64
+  // copies are done with FsMOVAPSrr and FsMOVAPDrr.
+
    case X86::FsMOVAPSrr:
    case X86::FsMOVAPDrr:
    case X86::MOVAPSrr:
    case X86::MOVAPDrr:
    case X86::MOVDQArr:
-  case X86::MOVSS2PSrr:
-  case X86::MOVSD2PDrr:
-  case X86::MOVPS2SSrr:
-  case X86::MOVPD2SDrr:
    case X86::MMX_MOVQ64rr:
      assert(MI.getNumOperands() >= 2 &&
             MI.getOperand(0).isReg() &&
@@ -711,6 +707,62 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
    }
  }
  
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                    unsigned &SrcReg, unsigned &DstReg,
+                                    unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {  // Only the MOVSX/MOVZX opcodes below qualify.
+  default: break;            // Anything else ends at the final return false.
+  case X86::MOVSX16rr8:
+  case X86::MOVZX16rr8:
+  case X86::MOVSX32rr8:
+  case X86::MOVZX32rr8:
+  case X86::MOVSX64rr8:
+  case X86::MOVZX64rr8:
+    if (!TM.getSubtarget<X86Subtarget>().is64Bit())
+      // In 32-bit mode it's not always legal to reference the low 8 bits of
+      // the larger register, so give up; in 64-bit mode fall through.
+      return false;
+  case X86::MOVSX32rr16:
+  case X86::MOVZX32rr16:
+  case X86::MOVSX64rr16:
+  case X86::MOVZX64rr16:
+  case X86::MOVSX64rr32:
+  case X86::MOVZX64rr32: {
+    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+      // Be conservative: bail if either operand already has a subreg index.
+      return false;
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    switch (MI.getOpcode()) {  // Pick the SubIdx that matches the opcode.
+    default:
+      llvm_unreachable(0);  // Every opcode admitted above is listed below.
+      break;
+    case X86::MOVSX16rr8:
+    case X86::MOVZX16rr8:
+    case X86::MOVSX32rr8:
+    case X86::MOVZX32rr8:
+    case X86::MOVSX64rr8:
+    case X86::MOVZX64rr8:
+      SubIdx = X86::sub_8bit;
+      break;
+    case X86::MOVSX32rr16:
+    case X86::MOVZX32rr16:
+    case X86::MOVSX64rr16:
+    case X86::MOVZX64rr16:
+      SubIdx = X86::sub_16bit;
+      break;
+    case X86::MOVSX64rr32:
+    case X86::MOVZX64rr32:
+      SubIdx = X86::sub_32bit;
+      break;
+    }
+    return true;  // SrcReg, DstReg and SubIdx are all set at this point.
+  }
+  }
+  return false;  // Reached only via the default case; outputs are not written.
+}
+
  /// isFrameOperand - Return true and the FrameIndex if the specified
  /// operand and follow operands form a reference to the stack frame.
  bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
@@ -811,7 +863,7 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
                                            int &FrameIndex) const {
    if (isFrameStoreOpcode(MI->getOpcode()))
      if (isFrameOperand(MI, 0, FrameIndex))
-      return MI->getOperand(X86AddrNumOperands).getReg();
+      return MI->getOperand(X86::AddrNumOperands).getReg();
    return 0;
  }
  
@@ -940,8 +992,10 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
  /// a few instructions in each direction it assumes it's not safe.
  static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator E = MBB.end();
+
    // It's always safe to clobber EFLAGS at the end of a block.
-  if (I == MBB.end())
+  if (I == E)
      return true;
  
    // For compile time consideration, if we are not able to determine the
@@ -965,20 +1019,28 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
        // This instruction defines EFLAGS, no need to look any further.
        return true;
      ++Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != E && Iter->isDebugValue())
+      ++Iter;
  
      // If we make it to the end of the block, it's safe to clobber EFLAGS.
-    if (Iter == MBB.end())
+    if (Iter == E)
        return true;
    }
  
+  MachineBasicBlock::iterator B = MBB.begin();
    Iter = I;
    for (unsigned i = 0; i < 4; ++i) {
      // If we make it to the beginning of the block, it's safe to clobber
      // EFLAGS iff EFLAGS is not live-in.
-    if (Iter == MBB.begin())
+    if (Iter == B)
        return !MBB.isLiveIn(X86::EFLAGS);
  
      --Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != B && Iter->isDebugValue())
+      --Iter;
+
      bool SawKill = false;
      for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
        MachineOperand &MO = Iter->getOperand(j);
@@ -1002,14 +1064,8 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   unsigned DestReg, unsigned SubIdx,
                                   const MachineInstr *Orig,
-                                 const TargetRegisterInfo *TRI) const {
-  DebugLoc DL = DebugLoc::getUnknownLoc();
-  if (I != MBB.end()) DL = I->getDebugLoc();
-
-  if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) {
-    DestReg = TRI->getSubReg(DestReg, SubIdx);
-    SubIdx = 0;
-  }
+                                 const TargetRegisterInfo &TRI) const {
+  DebugLoc DL = Orig->getDebugLoc();
  
    // MOV32r0 etc. are implemented with xor which clobbers condition code.
    // Re-materialize them as movri instructions to avoid side effects.
@@ -1018,12 +1074,16 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
    switch (Opc) {
    default: break;
    case X86::MOV8r0:
-  case X86::MOV32r0: {
+  case X86::MOV16r0:
+  case X86::MOV32r0:
+  case X86::MOV64r0: {
      if (!isSafeToClobberEFLAGS(MBB, I)) {
        switch (Opc) {
        default: break;
        case X86::MOV8r0:  Opc = X86::MOV8ri;  break;
+      case X86::MOV16r0: Opc = X86::MOV16ri; break;
        case X86::MOV32r0: Opc = X86::MOV32ri; break;
+      case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
        }
        Clone = false;
      }
@@ -1033,14 +1093,13 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
  
    if (Clone) {
      MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
-    MI->getOperand(0).setReg(DestReg);
      MBB.insert(I, MI);
    } else {
-    BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0);
+    BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0);
    }
  
    MachineInstr *NewMI = prior(I);
-  NewMI->getOperand(0).setSubReg(SubIdx);
+  NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
  }
  
  /// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
@@ -1086,10 +1145,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
    // least on modern x86 machines).
    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
    MachineInstr *InsMI =
-    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg)
-    .addReg(leaInReg)
-    .addReg(Src, getKillRegState(isKill))
-    .addImm(X86::SUBREG_16BIT);
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+    .addReg(Src, getKillRegState(isKill));
  
    MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
                                      get(Opc), leaOutReg);
@@ -1100,20 +1158,20 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
    case X86::SHL16ri: {
      unsigned ShAmt = MI->getOperand(2).getImm();
      MIB.addReg(0).addImm(1 << ShAmt)
-       .addReg(leaInReg, RegState::Kill).addImm(0);
+       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
      break;
    }
    case X86::INC16r:
    case X86::INC64_16r:
-    addLeaRegOffset(MIB, leaInReg, true, 1);
+    addRegOffset(MIB, leaInReg, true, 1);
      break;
    case X86::DEC16r:
    case X86::DEC64_16r:
-    addLeaRegOffset(MIB, leaInReg, true, -1);
+    addRegOffset(MIB, leaInReg, true, -1);
      break;
    case X86::ADD16ri:
    case X86::ADD16ri8:
-    addLeaRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());    
+    addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());    
      break;
    case X86::ADD16rr: {
      unsigned Src2 = MI->getOperand(2).getReg();
@@ -1130,10 +1188,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
        // well be shifting and then extracting the lower 16-bits. 
        BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
        InsMI2 =
-        BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg2)
-        .addReg(leaInReg2)
-        .addReg(Src2, getKillRegState(isKill2))
-        .addImm(X86::SUBREG_16BIT);
+        BuildMI(*MFI, MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
+        .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+        .addReg(Src2, getKillRegState(isKill2));
        addRegReg(MIB, leaInReg, true, leaInReg2, true);
      }
      if (LV && isKill2 && InsMI2)
@@ -1144,10 +1201,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
  
    MachineInstr *NewMI = MIB;
    MachineInstr *ExtMI =
-    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG))
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-    .addReg(leaOutReg, RegState::Kill)
-    .addImm(X86::SUBREG_16BIT);
+    .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
  
    if (LV) {
      // Update live variables
@@ -1218,7 +1274,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
        .addReg(Dest, RegState::Define | getDeadRegState(isDead))
        .addReg(0).addImm(1 << ShAmt)
        .addReg(Src, getKillRegState(isKill))
-      .addImm(0);
+      .addImm(0).addReg(0);
      break;
    }
    case X86::SHL32ri: {
@@ -1232,7 +1288,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
      NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
        .addReg(Dest, RegState::Define | getDeadRegState(isDead))
        .addReg(0).addImm(1 << ShAmt)
-      .addReg(Src, getKillRegState(isKill)).addImm(0);
+      .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0);
      break;
    }
    case X86::SHL16ri: {
@@ -1248,7 +1304,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
        .addReg(Dest, RegState::Define | getDeadRegState(isDead))
        .addReg(0).addImm(1 << ShAmt)
        .addReg(Src, getKillRegState(isKill))
-      .addImm(0);
+      .addImm(0).addReg(0);
      break;
    }
    default: {
@@ -1266,7 +1322,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
        assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
        unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
          : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
                                .addReg(Dest, RegState::Define |
                                        getDeadRegState(isDead)),
                                Src, isKill, 1);
@@ -1288,7 +1344,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
        assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
        unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
          : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
                                .addReg(Dest, RegState::Define |
                                        getDeadRegState(isDead)),
                                Src, isKill, -1);
@@ -1336,7 +1392,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
      case X86::ADD64ri32:
      case X86::ADD64ri8:
        assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
                                .addReg(Dest, RegState::Define |
                                        getDeadRegState(isDead)),
                                Src, isKill, MI->getOperand(2).getImm());
@@ -1345,7 +1401,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
      case X86::ADD32ri8: {
        assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
        unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
                                .addReg(Dest, RegState::Define |
                                        getDeadRegState(isDead)),
                                  Src, isKill, MI->getOperand(2).getImm());
@@ -1356,7 +1412,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
        if (DisableLEA16)
          return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
        assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                                .addReg(Dest, RegState::Define |
                                        getDeadRegState(isDead)),
                                Src, isKill, MI->getOperand(2).getImm());
@@ -1527,44 +1583,44 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
  static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
    switch (BrOpc) {
    default: return X86::COND_INVALID;
-  case X86::JE:  return X86::COND_E;
-  case X86::JNE: return X86::COND_NE;
-  case X86::JL:  return X86::COND_L;
-  case X86::JLE: return X86::COND_LE;
-  case X86::JG:  return X86::COND_G;
-  case X86::JGE: return X86::COND_GE;
-  case X86::JB:  return X86::COND_B;
-  case X86::JBE: return X86::COND_BE;
-  case X86::JA:  return X86::COND_A;
-  case X86::JAE: return X86::COND_AE;
-  case X86::JS:  return X86::COND_S;
-  case X86::JNS: return X86::COND_NS;
-  case X86::JP:  return X86::COND_P;
-  case X86::JNP: return X86::COND_NP;
-  case X86::JO:  return X86::COND_O;
-  case X86::JNO: return X86::COND_NO;
+  case X86::JE_4:  return X86::COND_E;  // Keep in sync with GetCondBranchFromCond.
+  case X86::JNE_4: return X86::COND_NE;
+  case X86::JL_4:  return X86::COND_L;
+  case X86::JLE_4: return X86::COND_LE;
+  case X86::JG_4:  return X86::COND_G;
+  case X86::JGE_4: return X86::COND_GE;
+  case X86::JB_4:  return X86::COND_B;
+  case X86::JBE_4: return X86::COND_BE;
+  case X86::JA_4:  return X86::COND_A;
+  case X86::JAE_4: return X86::COND_AE;
+  case X86::JS_4:  return X86::COND_S;
+  case X86::JNS_4: return X86::COND_NS;
+  case X86::JP_4:  return X86::COND_P;
+  case X86::JNP_4: return X86::COND_NP;
+  case X86::JO_4:  return X86::COND_O;
+  case X86::JNO_4: return X86::COND_NO;
    }
  }
  
  unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
    switch (CC) {
    default: llvm_unreachable("Illegal condition code!");
-  case X86::COND_E:  return X86::JE;
-  case X86::COND_NE: return X86::JNE;
-  case X86::COND_L:  return X86::JL;
-  case X86::COND_LE: return X86::JLE;
-  case X86::COND_G:  return X86::JG;
-  case X86::COND_GE: return X86::JGE;
-  case X86::COND_B:  return X86::JB;
-  case X86::COND_BE: return X86::JBE;
-  case X86::COND_A:  return X86::JA;
-  case X86::COND_AE: return X86::JAE;
-  case X86::COND_S:  return X86::JS;
-  case X86::COND_NS: return X86::JNS;
-  case X86::COND_P:  return X86::JP;
-  case X86::COND_NP: return X86::JNP;
-  case X86::COND_O:  return X86::JO;
-  case X86::COND_NO: return X86::JNO;
+  case X86::COND_E:  return X86::JE_4;  // Inverse of the GetCondFromBranchOpc table.
+  case X86::COND_NE: return X86::JNE_4;
+  case X86::COND_L:  return X86::JL_4;
+  case X86::COND_LE: return X86::JLE_4;
+  case X86::COND_G:  return X86::JG_4;
+  case X86::COND_GE: return X86::JGE_4;
+  case X86::COND_B:  return X86::JB_4;
+  case X86::COND_BE: return X86::JBE_4;
+  case X86::COND_A:  return X86::JA_4;
+  case X86::COND_AE: return X86::JAE_4;
+  case X86::COND_S:  return X86::JS_4;
+  case X86::COND_NS: return X86::JNS_4;
+  case X86::COND_P:  return X86::JP_4;
+  case X86::COND_NP: return X86::JNP_4;
+  case X86::COND_O:  return X86::JO_4;
+  case X86::COND_NO: return X86::JNO_4;
    }
  }
  
@@ -1620,8 +1676,11 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
    // Start from the bottom of the block and work up, examining the
    // terminator instructions.
    MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
    while (I != MBB.begin()) {
      --I;
+    if (I->isDebugValue())
+      continue;
  
      // Working from the bottom, when we see a non-terminator instruction, we're
      // done.
@@ -1634,7 +1693,9 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
        return true;
  
      // Handle unconditional branches.
-    if (I->getOpcode() == X86::JMP) {
+    if (I->getOpcode() == X86::JMP_4) {
+      UnCondBrIter = I;
+
        if (!AllowModify) {
          TBB = I->getOperand(0).getMBB();
          continue;
@@ -1652,10 +1713,11 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
          TBB = 0;
          I->eraseFromParent();
          I = MBB.end();
+        UnCondBrIter = MBB.end();
          continue;
        }
  
-      // TBB is used to indicate the unconditinal destination.
+      // TBB is used to indicate the unconditional destination.
        TBB = I->getOperand(0).getMBB();
        continue;
      }
@@ -1667,6 +1729,45 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
  
      // Working from the bottom, handle the first conditional branch.
      if (Cond.empty()) {
+      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+      if (AllowModify && UnCondBrIter != MBB.end() &&
+          MBB.isLayoutSuccessor(TargetBB)) {
+        // If we can modify the code and it ends in something like:
+        //
+        //     jCC L1
+        //     jmp L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Then we can change this to:
+        //
+        //     jnCC L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Which is a bit more efficient.
+        // We conditionally jump to the fall-through block.
+        BranchCode = GetOppositeBranchCondition(BranchCode);
+        unsigned JNCC = GetCondBranchFromCond(BranchCode);
+        MachineBasicBlock::iterator OldInst = I;
+
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+          .addMBB(UnCondBrIter->getOperand(0).getMBB());
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4))
+          .addMBB(TargetBB);
+        MBB.addSuccessor(TargetBB);
+
+        OldInst->eraseFromParent();
+        UnCondBrIter->eraseFromParent();
+
+        // Restart the analysis.
+        UnCondBrIter = MBB.end();
+        I = MBB.end();
+        continue;
+      }
+
        FBB = TBB;
        TBB = I->getOperand(0).getMBB();
        Cond.push_back(MachineOperand::CreateImm(BranchCode));
@@ -1718,7 +1819,9 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  
    while (I != MBB.begin()) {
      --I;
-    if (I->getOpcode() != X86::JMP &&
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != X86::JMP_4 &&
          GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
        break;
      // Remove the branch.
@@ -1733,9 +1836,8 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  unsigned
  X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                             MachineBasicBlock *FBB,
-                           const SmallVectorImpl<MachineOperand> &Cond) const {
-  // FIXME this should probably have a DebugLoc operand
-  DebugLoc dl = DebugLoc::getUnknownLoc();
+                           const SmallVectorImpl<MachineOperand> &Cond,
+                           DebugLoc DL) const {
    // Shouldn't be a fall through.
    assert(TBB && "InsertBranch must not be told to insert a fallthrough");
    assert((Cond.size() == 1 || Cond.size() == 0) &&
@@ -1744,7 +1846,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
    if (Cond.empty()) {
      // Unconditional branch?
      assert(!FBB && "Unconditional branch with multiple successors!");
-    BuildMI(&MBB, dl, get(X86::JMP)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB);
      return 1;
    }
  
@@ -1754,27 +1856,27 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
    switch (CC) {
    case X86::COND_NP_OR_E:
      // Synthesize NP_OR_E with two branches.
-    BuildMI(&MBB, dl, get(X86::JNP)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB);
      ++Count;
-    BuildMI(&MBB, dl, get(X86::JE)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB);
      ++Count;
      break;
    case X86::COND_NE_OR_P:
      // Synthesize NE_OR_P with two branches.
-    BuildMI(&MBB, dl, get(X86::JNE)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB);
      ++Count;
-    BuildMI(&MBB, dl, get(X86::JP)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB);
      ++Count;
      break;
    default: {
      unsigned Opc = GetCondBranchFromCond(CC);
-    BuildMI(&MBB, dl, get(Opc)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
      ++Count;
    }
    }
    if (FBB) {
      // Two-way Conditional branch. Insert the second branch.
-    BuildMI(&MBB, dl, get(X86::JMP)).addMBB(FBB);
+    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB);
      ++Count;
    }
    return Count;
@@ -1789,9 +1891,8 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  unsigned DestReg, unsigned SrcReg,
                                  const TargetRegisterClass *DestRC,
-                                const TargetRegisterClass *SrcRC) const {
-  DebugLoc DL = DebugLoc::getUnknownLoc();
-  if (MI != MBB.end()) DL = MI->getDebugLoc();
+                                const TargetRegisterClass *SrcRC,
+                                DebugLoc DL) const {
  
    // Determine if DstRC and SrcRC have a common superclass in common.
    const TargetRegisterClass *CommonRC = DestRC;
@@ -1801,7 +1902,7 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
      CommonRC = SrcRC;
    else if (!DestRC->hasSubClass(SrcRC)) {
      // Neither of GR64_NOREX or GR64_NOSP is a superclass of the other,
-    // but we want to copy then as GR64. Similarly, for GR32_NOREX and
+    // but we want to copy them as GR64. Similarly, for GR32_NOREX and
      // GR32_NOSP, copy as GR32.
      if (SrcRC->hasSuperClass(&X86::GR64RegClass) &&
          DestRC->hasSuperClass(&X86::GR64RegClass))
@@ -1809,6 +1910,9 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
      else if (SrcRC->hasSuperClass(&X86::GR32RegClass) &&
               DestRC->hasSuperClass(&X86::GR32RegClass))
        CommonRC = &X86::GR32RegClass;
+    else if (SrcRC->hasSuperClass(&X86::GR8RegClass) &&
+             DestRC->hasSuperClass(&X86::GR8RegClass))
+      CommonRC = &X86::GR8RegClass;
      else
        CommonRC = 0;
    }
@@ -1825,7 +1929,9 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
      } else if (CommonRC == &X86::GR8RegClass) {
        // Copying to or from a physical H register on x86-64 requires a NOREX
        // move.  Otherwise use a normal move.
-      if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+      if ((isHReg(DestReg) || isHReg(SrcReg) ||
+           SrcRC == &X86::GR8_ABCD_HRegClass ||
+           DestRC == &X86::GR8_ABCD_HRegClass) &&
            TM.getSubtarget<X86Subtarget>().is64Bit())
          Opc = X86::MOV8rr_NOREX;
        else
@@ -1852,6 +1958,10 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
        Opc = X86::MOV16rr;
      } else if (CommonRC == &X86::GR8_NOREXRegClass) {
        Opc = X86::MOV8rr;
+    } else if (CommonRC == &X86::GR64_TCRegClass) {
+      Opc = X86::MOV64rr_TC;
+    } else if (CommonRC == &X86::GR32_TCRegClass) {
+      Opc = X86::MOV32rr_TC;
      } else if (CommonRC == &X86::RFP32RegClass) {
        Opc = X86::MOV_Fp3232;
      } else if (CommonRC == &X86::RFP64RegClass || CommonRC == &X86::RSTRegClass) {
@@ -1878,12 +1988,12 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
      if (SrcReg != X86::EFLAGS)
        return false;
      if (DestRC == &X86::GR64RegClass || DestRC == &X86::GR64_NOSPRegClass) {
-      BuildMI(MBB, MI, DL, get(X86::PUSHFQ64));
+      BuildMI(MBB, MI, DL, get(X86::PUSHF64));
        BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
        return true;
      } else if (DestRC == &X86::GR32RegClass ||
                 DestRC == &X86::GR32_NOSPRegClass) {
-      BuildMI(MBB, MI, DL, get(X86::PUSHFD));
+      BuildMI(MBB, MI, DL, get(X86::PUSHF32));
        BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
        return true;
      }
@@ -1892,12 +2002,12 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
        return false;
      if (SrcRC == &X86::GR64RegClass || DestRC == &X86::GR64_NOSPRegClass) {
        BuildMI(MBB, MI, DL, get(X86::PUSH64r)).addReg(SrcReg);
-      BuildMI(MBB, MI, DL, get(X86::POPFQ));
+      BuildMI(MBB, MI, DL, get(X86::POPF64));
        return true;
      } else if (SrcRC == &X86::GR32RegClass ||
                 DestRC == &X86::GR32_NOSPRegClass) {
        BuildMI(MBB, MI, DL, get(X86::PUSH32r)).addReg(SrcReg);
-      BuildMI(MBB, MI, DL, get(X86::POPFD));
+      BuildMI(MBB, MI, DL, get(X86::POPF32));
        return true;
      }
    }
@@ -1948,79 +2058,160 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
    return false;
  }
  
-static unsigned getStoreRegOpcode(unsigned SrcReg,
-                                  const TargetRegisterClass *RC,
-                                  bool isStackAligned,
-                                  TargetMachine &TM) {
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  // First deal with the normal symmetric copies.
    unsigned Opc = 0;
+  if (X86::GR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV64rr;
+  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV32rr;
+  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV16rr;
+  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+    // Copying to or from a physical H register on x86-64 requires a NOREX
+    // move.  Otherwise use a normal move.
+    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+        TM.getSubtarget<X86Subtarget>().is64Bit())
+      Opc = X86::MOV8rr_NOREX;
+    else
+      Opc = X86::MOV8rr;
+  } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOVAPSrr;
+  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MMX_MOVQ64rr;
+
+  if (Opc) {
+    BuildMI(MBB, MI, DL, get(Opc), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  // Moving EFLAGS to / from another register requires a push and a pop.
+  if (SrcReg == X86::EFLAGS) {
+    if (X86::GR64RegClass.contains(DestReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHF64));
+      BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+      return;
+    } else if (X86::GR32RegClass.contains(DestReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHF32));
+      BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+      return;
+    }
+  }
+  if (DestReg == X86::EFLAGS) {
+    if (X86::GR64RegClass.contains(SrcReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH64r))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::POPF64));
+      return;
+    } else if (X86::GR32RegClass.contains(SrcReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH32r))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::POPF32));
+      return;
+    }
+  }
+
+  DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+               << " to " << RI.getName(DestReg) << '\n');
+  llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+static unsigned getLoadStoreRegOpcode(unsigned Reg,
+                                      const TargetRegisterClass *RC,
+                                      bool isStackAligned,
+                                      const TargetMachine &TM,
+                                      bool load) {
    if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) {
-    Opc = X86::MOV64mr;
+    return load ? X86::MOV64rm : X86::MOV64mr;
    } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) {
-    Opc = X86::MOV32mr;
+    return load ? X86::MOV32rm : X86::MOV32mr;
    } else if (RC == &X86::GR16RegClass) {
-    Opc = X86::MOV16mr;
+    return load ? X86::MOV16rm : X86::MOV16mr;
    } else if (RC == &X86::GR8RegClass) {
      // Copying to or from a physical H register on x86-64 requires a NOREX
      // move.  Otherwise use a normal move.
-    if (isHReg(SrcReg) &&
+    if (isHReg(Reg) &&
          TM.getSubtarget<X86Subtarget>().is64Bit())
-      Opc = X86::MOV8mr_NOREX;
+      return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
      else
-      Opc = X86::MOV8mr;
+      return load ? X86::MOV8rm : X86::MOV8mr;
    } else if (RC == &X86::GR64_ABCDRegClass) {
-    Opc = X86::MOV64mr;
+    return load ? X86::MOV64rm : X86::MOV64mr;
    } else if (RC == &X86::GR32_ABCDRegClass) {
-    Opc = X86::MOV32mr;
+    return load ? X86::MOV32rm : X86::MOV32mr;
    } else if (RC == &X86::GR16_ABCDRegClass) {
-    Opc = X86::MOV16mr;
+    return load ? X86::MOV16rm : X86::MOV16mr;
    } else if (RC == &X86::GR8_ABCD_LRegClass) {
-    Opc = X86::MOV8mr;
+    return load ? X86::MOV8rm :X86::MOV8mr;
    } else if (RC == &X86::GR8_ABCD_HRegClass) {
      if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      Opc = X86::MOV8mr_NOREX;
+      return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
      else
-      Opc = X86::MOV8mr;
+      return load ? X86::MOV8rm : X86::MOV8mr;
    } else if (RC == &X86::GR64_NOREXRegClass ||
               RC == &X86::GR64_NOREX_NOSPRegClass) {
-    Opc = X86::MOV64mr;
+    return load ? X86::MOV64rm : X86::MOV64mr;
    } else if (RC == &X86::GR32_NOREXRegClass) {
-    Opc = X86::MOV32mr;
+    return load ? X86::MOV32rm : X86::MOV32mr;
    } else if (RC == &X86::GR16_NOREXRegClass) {
-    Opc = X86::MOV16mr;
+    return load ? X86::MOV16rm : X86::MOV16mr;
    } else if (RC == &X86::GR8_NOREXRegClass) {
-    Opc = X86::MOV8mr;
+    return load ? X86::MOV8rm : X86::MOV8mr;
+  } else if (RC == &X86::GR64_TCRegClass) {
+    return load ? X86::MOV64rm_TC : X86::MOV64mr_TC;
+  } else if (RC == &X86::GR32_TCRegClass) {
+    return load ? X86::MOV32rm_TC : X86::MOV32mr_TC;
    } else if (RC == &X86::RFP80RegClass) {
-    Opc = X86::ST_FpP80m;   // pops
+    return load ? X86::LD_Fp80m : X86::ST_FpP80m;
    } else if (RC == &X86::RFP64RegClass) {
-    Opc = X86::ST_Fp64m;
+    return load ? X86::LD_Fp64m : X86::ST_Fp64m;
    } else if (RC == &X86::RFP32RegClass) {
-    Opc = X86::ST_Fp32m;
+    return load ? X86::LD_Fp32m : X86::ST_Fp32m;
    } else if (RC == &X86::FR32RegClass) {
-    Opc = X86::MOVSSmr;
+    return load ? X86::MOVSSrm : X86::MOVSSmr;
    } else if (RC == &X86::FR64RegClass) {
-    Opc = X86::MOVSDmr;
+    return load ? X86::MOVSDrm : X86::MOVSDmr;
    } else if (RC == &X86::VR128RegClass) {
      // If stack is realigned we can use aligned stores.
-    Opc = isStackAligned ? X86::MOVAPSmr : X86::MOVUPSmr;
+    if (isStackAligned)
+      return load ? X86::MOVAPSrm : X86::MOVAPSmr;
+    else
+      return load ? X86::MOVUPSrm : X86::MOVUPSmr;
    } else if (RC == &X86::VR64RegClass) {
-    Opc = X86::MMX_MOVQ64mr;
+    return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
    } else {
      llvm_unreachable("Unknown regclass");
    }
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+                                  const TargetRegisterClass *RC,
+                                  bool isStackAligned,
+                                  TargetMachine &TM) {
+  return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false);
+}
+
  
-  return Opc;
+static unsigned getLoadRegOpcode(unsigned DestReg,
+                                 const TargetRegisterClass *RC,
+                                 bool isStackAligned,
+                                 const TargetMachine &TM) {
+  return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true);
  }
  
  void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MI,
                                         unsigned SrcReg, bool isKill, int FrameIdx,
-                                       const TargetRegisterClass *RC) const {
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
    const MachineFunction &MF = *MBB.getParent();
-  bool isAligned = (RI.getStackAlignment() >= 16) ||
-    RI.needsStackRealignment(MF);
+  bool isAligned = (RI.getStackAlignment() >= 16) || RI.canRealignStack(MF);
    unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
-  DebugLoc DL = DebugLoc::getUnknownLoc();
-  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  DebugLoc DL = MBB.findDebugLoc(MI);
    addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
      .addReg(SrcReg, getKillRegState(isKill));
  }
@@ -2032,9 +2223,9 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
                                    MachineInstr::mmo_iterator MMOBegin,
                                    MachineInstr::mmo_iterator MMOEnd,
                                    SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+  bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
    unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
-  DebugLoc DL = DebugLoc::getUnknownLoc();
+  DebugLoc DL;
    MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
    for (unsigned i = 0, e = Addr.size(); i != e; ++i)
      MIB.addOperand(Addr[i]);
@@ -2043,79 +2234,16 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
    NewMIs.push_back(MIB);
  }
  
-static unsigned getLoadRegOpcode(unsigned DestReg,
-                                 const TargetRegisterClass *RC,
-                                 bool isStackAligned,
-                                 const TargetMachine &TM) {
-  unsigned Opc = 0;
-  if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) {
-    Opc = X86::MOV64rm;
-  } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) {
-    Opc = X86::MOV32rm;
-  } else if (RC == &X86::GR16RegClass) {
-    Opc = X86::MOV16rm;
-  } else if (RC == &X86::GR8RegClass) {
-    // Copying to or from a physical H register on x86-64 requires a NOREX
-    // move.  Otherwise use a normal move.
-    if (isHReg(DestReg) &&
-        TM.getSubtarget<X86Subtarget>().is64Bit())
-      Opc = X86::MOV8rm_NOREX;
-    else
-      Opc = X86::MOV8rm;
-  } else if (RC == &X86::GR64_ABCDRegClass) {
-    Opc = X86::MOV64rm;
-  } else if (RC == &X86::GR32_ABCDRegClass) {
-    Opc = X86::MOV32rm;
-  } else if (RC == &X86::GR16_ABCDRegClass) {
-    Opc = X86::MOV16rm;
-  } else if (RC == &X86::GR8_ABCD_LRegClass) {
-    Opc = X86::MOV8rm;
-  } else if (RC == &X86::GR8_ABCD_HRegClass) {
-    if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      Opc = X86::MOV8rm_NOREX;
-    else
-      Opc = X86::MOV8rm;
-  } else if (RC == &X86::GR64_NOREXRegClass ||
-             RC == &X86::GR64_NOREX_NOSPRegClass) {
-    Opc = X86::MOV64rm;
-  } else if (RC == &X86::GR32_NOREXRegClass) {
-    Opc = X86::MOV32rm;
-  } else if (RC == &X86::GR16_NOREXRegClass) {
-    Opc = X86::MOV16rm;
-  } else if (RC == &X86::GR8_NOREXRegClass) {
-    Opc = X86::MOV8rm;
-  } else if (RC == &X86::RFP80RegClass) {
-    Opc = X86::LD_Fp80m;
-  } else if (RC == &X86::RFP64RegClass) {
-    Opc = X86::LD_Fp64m;
-  } else if (RC == &X86::RFP32RegClass) {
-    Opc = X86::LD_Fp32m;
-  } else if (RC == &X86::FR32RegClass) {
-    Opc = X86::MOVSSrm;
-  } else if (RC == &X86::FR64RegClass) {
-    Opc = X86::MOVSDrm;
-  } else if (RC == &X86::VR128RegClass) {
-    // If stack is realigned we can use aligned loads.
-    Opc = isStackAligned ? X86::MOVAPSrm : X86::MOVUPSrm;
-  } else if (RC == &X86::VR64RegClass) {
-    Opc = X86::MMX_MOVQ64rm;
-  } else {
-    llvm_unreachable("Unknown regclass");
-  }
-
-  return Opc;
-}
  
  void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MI,
                                          unsigned DestReg, int FrameIdx,
-                                        const TargetRegisterClass *RC) const{
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
    const MachineFunction &MF = *MBB.getParent();
-  bool isAligned = (RI.getStackAlignment() >= 16) ||
-    RI.needsStackRealignment(MF);
+  bool isAligned = (RI.getStackAlignment() >= 16) || RI.canRealignStack(MF);
    unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
-  DebugLoc DL = DebugLoc::getUnknownLoc();
-  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  DebugLoc DL = MBB.findDebugLoc(MI);
    addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
  }
  
@@ -2125,9 +2253,9 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                   MachineInstr::mmo_iterator MMOBegin,
                                   MachineInstr::mmo_iterator MMOEnd,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+  bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
    unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
-  DebugLoc DL = DebugLoc::getUnknownLoc();
+  DebugLoc DL;
    MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
    for (unsigned i = 0, e = Addr.size(); i != e; ++i)
      MIB.addOperand(Addr[i]);
@@ -2137,12 +2265,12 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
  
  bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
-                                const std::vector<CalleeSavedInfo> &CSI) const {
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                          const TargetRegisterInfo *TRI) const {
    if (CSI.empty())
      return false;
  
-  DebugLoc DL = DebugLoc::getUnknownLoc();
-  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  DebugLoc DL = MBB.findDebugLoc(MI);
  
    bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
    bool isWin64 = TM.getSubtarget<X86Subtarget>().isTargetWin64();
@@ -2156,17 +2284,17 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
    unsigned Opc = is64Bit ? X86::PUSH64r : X86::PUSH32r;
    for (unsigned i = CSI.size(); i != 0; --i) {
      unsigned Reg = CSI[i-1].getReg();
-    const TargetRegisterClass *RegClass = CSI[i-1].getRegClass();
      // Add the callee-saved register as live-in. It's killed at the spill.
      MBB.addLiveIn(Reg);
      if (Reg == FPReg)
        // X86RegisterInfo::emitPrologue will handle spilling of frame register.
        continue;
-    if (RegClass != &X86::VR128RegClass && !isWin64) {
+    if (!X86::VR128RegClass.contains(Reg) && !isWin64) {
        CalleeFrameSize += SlotSize;
        BuildMI(MBB, MI, DL, get(Opc)).addReg(Reg, RegState::Kill);
      } else {
-      storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), RegClass);
+      storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
+                          &X86::VR128RegClass, &RI);
      }
    }
  
@@ -2176,12 +2304,12 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
  
  bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                                 MachineBasicBlock::iterator MI,
-                                const std::vector<CalleeSavedInfo> &CSI) const {
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                          const TargetRegisterInfo *TRI) const {
    if (CSI.empty())
      return false;
  
-  DebugLoc DL = DebugLoc::getUnknownLoc();
-  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  DebugLoc DL = MBB.findDebugLoc(MI);
  
    MachineFunction &MF = *MBB.getParent();
    unsigned FPReg = RI.getFrameRegister(MF);
@@ -2193,16 +2321,29 @@ bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
      if (Reg == FPReg)
        // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
        continue;
-    const TargetRegisterClass *RegClass = CSI[i].getRegClass();
-    if (RegClass != &X86::VR128RegClass && !isWin64) {
+    if (!X86::VR128RegClass.contains(Reg) && !isWin64) {
        BuildMI(MBB, MI, DL, get(Opc), Reg);
      } else {
-      loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass);
+      loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
+                           &X86::VR128RegClass, &RI);
      }
    }
    return true;
  }
  
+MachineInstr*
+X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                       int FrameIx, uint64_t Offset,
+                                       const MDNode *MDPtr,
+                                       DebugLoc DL) const {
+  X86AddressMode AM;
+  AM.BaseType = X86AddressMode::FrameIndexBase;
+  AM.Base.FrameIndex = FrameIx;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE));
+  addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
  static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
                                       const SmallVectorImpl<MachineOperand> &MOs,
                                       MachineInstr *MI,
@@ -2290,8 +2431,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      OpcodeTablePtr = &RegOp2MemOpTable2Addr;
      isTwoAddrFold = true;
    } else if (i == 0) { // If operand 0
-    if (MI->getOpcode() == X86::MOV32r0)
+    if (MI->getOpcode() == X86::MOV64r0)
+      NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
+    else if (MI->getOpcode() == X86::MOV32r0)
        NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+    else if (MI->getOpcode() == X86::MOV16r0)
+      NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
      else if (MI->getOpcode() == X86::MOV8r0)
        NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
      if (NewMI)
@@ -2344,9 +2489,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
          unsigned DstReg = NewMI->getOperand(0).getReg();
          if (TargetRegisterInfo::isPhysicalRegister(DstReg))
            NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
-                                                   4/*x86_subreg_32bit*/));
+                                                   X86::sub_32bit));
          else
-          NewMI->getOperand(0).setSubReg(4/*x86_subreg_32bit*/);
+          NewMI->getOperand(0).setSubReg(X86::sub_32bit);
        }
        return NewMI;
      }
@@ -2354,7 +2499,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
    
    // No fusion 
    if (PrintFailedFusing)
-    errs() << "We failed to fuse operand " << i << " in " << *MI;
+    dbgs() << "We failed to fuse operand " << i << " in " << *MI;
    return NULL;
  }
  
@@ -2392,9 +2537,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      switch (MI->getOpcode()) {
      default: return NULL;
      case X86::TEST8rr:  NewOpc = X86::CMP8ri; RCSize = 1; break;
-    case X86::TEST16rr: NewOpc = X86::CMP16ri; RCSize = 2; break;
-    case X86::TEST32rr: NewOpc = X86::CMP32ri; RCSize = 4; break;
-    case X86::TEST64rr: NewOpc = X86::CMP64ri32; RCSize = 8; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
      }
      // Check if it's safe to fold the load. If the size of the object is
      // narrower than the load width, then it's not.
@@ -2441,7 +2586,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      Alignment = (*LoadMI->memoperands_begin())->getAlignment();
    else
      switch (LoadMI->getOpcode()) {
-    case X86::V_SET0:
+    case X86::V_SET0PS:
+    case X86::V_SET0PD:
+    case X86::V_SET0PI:
      case X86::V_SETALLONES:
        Alignment = 16;
        break;
@@ -2459,9 +2606,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      switch (MI->getOpcode()) {
      default: return NULL;
      case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
-    case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
-    case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
-    case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
      }
      // Change to CMPXXri r, 0 first.
      MI->setDesc(get(NewOpc));
@@ -2469,15 +2616,22 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
    } else if (Ops.size() != 1)
      return NULL;
  
-  SmallVector<MachineOperand,X86AddrNumOperands> MOs;
+  SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
    switch (LoadMI->getOpcode()) {
-  case X86::V_SET0:
+  case X86::V_SET0PS:
+  case X86::V_SET0PD:
+  case X86::V_SET0PI:
    case X86::V_SETALLONES:
    case X86::FsFLD0SD:
    case X86::FsFLD0SS: {
-    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+    // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure.
      // Create a constant-pool entry and operands to load from it.
  
+    // Medium and large mode can't fold loads this way.
+    if (TM.getCodeModel() != CodeModel::Small &&
+        TM.getCodeModel() != CodeModel::Kernel)
+      return NULL;
+
      // x86-32 PIC requires a PIC base register for constant pools.
      unsigned PICBase = 0;
      if (TM.getRelocationModel() == Reloc::PIC_) {
@@ -2500,7 +2654,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
        Ty = Type::getDoubleTy(MF.getFunction()->getContext());
      else
        Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
-    Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
+    const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
                      Constant::getAllOnesValue(Ty) :
                      Constant::getNullValue(Ty);
      unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
@@ -2516,7 +2670,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
    default: {
      // Folding a normal load. Just copy the load's address operands.
      unsigned NumOps = LoadMI->getDesc().getNumOperands();
-    for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
+    for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
        MOs.push_back(LoadMI->getOperand(i));
      break;
    }
@@ -2559,7 +2713,9 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
    } else if (OpNum == 0) { // If operand 0
      switch (Opc) {
      case X86::MOV8r0:
+    case X86::MOV16r0:
      case X86::MOV32r0:
+    case X86::MOV64r0:
        return true;
      default: break;
      }
@@ -2587,7 +2743,6 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
      MemOp2RegOpTable.find((unsigned*)MI->getOpcode());
    if (I == MemOp2RegOpTable.end())
      return false;
-  DebugLoc dl = MI->getDebugLoc();
    unsigned Opc = I->second.first;
    unsigned Index = I->second.second & 0xf;
    bool FoldedLoad = I->second.second & (1 << 4);
@@ -2602,13 +2757,20 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
    const TargetInstrDesc &TID = get(Opc);
    const TargetOperandInfo &TOI = TID.OpInfo[Index];
    const TargetRegisterClass *RC = TOI.getRegClass(&RI);
-  SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
+  if (!MI->hasOneMemOperand() &&
+      RC == &X86::VR128RegClass &&
+      !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+    // Without memoperands, loadRegFromAddr and storeRegToAddr will
+    // conservatively assume the address is unaligned. That's bad for
+    // performance.
+    return false;
+  SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
    SmallVector<MachineOperand,2> BeforeOps;
    SmallVector<MachineOperand,2> AfterOps;
    SmallVector<MachineOperand,4> ImpOps;
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      MachineOperand &Op = MI->getOperand(i);
-    if (i >= Index && i < Index + X86AddrNumOperands)
+    if (i >= Index && i < Index + X86::AddrNumOperands)
        AddrOps.push_back(Op);
      else if (Op.isReg() && Op.isImplicit())
        ImpOps.push_back(Op);
@@ -2627,7 +2789,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
      loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
      if (UnfoldStore) {
        // Address operands cannot be marked isKill.
-      for (unsigned i = 1; i != 1 + X86AddrNumOperands; ++i) {
+      for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
          MachineOperand &MO = NewMIs[0]->getOperand(i);
          if (MO.isReg())
            MO.setIsKill(false);
@@ -2661,16 +2823,22 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
    switch (DataMI->getOpcode()) {
    default: break;
    case X86::CMP64ri32:
+  case X86::CMP64ri8:
    case X86::CMP32ri:
+  case X86::CMP32ri8:
    case X86::CMP16ri:
+  case X86::CMP16ri8:
    case X86::CMP8ri: {
      MachineOperand &MO0 = DataMI->getOperand(0);
      MachineOperand &MO1 = DataMI->getOperand(1);
      if (MO1.getImm() == 0) {
        switch (DataMI->getOpcode()) {
        default: break;
+      case X86::CMP64ri8:
        case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+      case X86::CMP32ri8:
        case X86::CMP32ri:   NewOpc = X86::TEST32rr; break;
+      case X86::CMP16ri8:
        case X86::CMP16ri:   NewOpc = X86::TEST16rr; break;
        case X86::CMP8ri:    NewOpc = X86::TEST8rr; break;
        }
@@ -2718,7 +2886,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
    unsigned NumOps = N->getNumOperands();
    for (unsigned i = 0; i != NumOps-1; ++i) {
      SDValue Op = N->getOperand(i);
-    if (i >= Index-NumDefs && i < Index-NumDefs + X86AddrNumOperands)
+    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
        AddrOps.push_back(Op);
      else if (i < Index-NumDefs)
        BeforeOps.push_back(Op);
@@ -2737,7 +2905,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                MachineInstr::mmo_iterator> MMOs =
        MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                              cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned load.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
      Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
                                VT, MVT::Other, &AddrOps[0], AddrOps.size());
      NewNodes.push_back(Load);
@@ -2774,7 +2947,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                MachineInstr::mmo_iterator> MMOs =
        MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                               cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned store.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
      SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
                                                           isAligned, TM),
                                         dl, MVT::Other,
@@ -2806,6 +2984,135 @@ unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
    return I->second.first;
  }
  
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                     int64_t &Offset1, int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  switch (Opc1) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVUPSrm_Int:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::MOVDQUrm_Int:
+    break;
+  }
+  switch (Opc2) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVUPSrm_Int:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::MOVDQUrm_Int:
+    break;
+  }
+
+  // Check if chain operands and base addresses match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(5) != Load2->getOperand(5))
+    return false;
+  // Segment operands should match as well.
+  if (Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+  // Scale should be 1, Index should be Reg0.
+  if (Load1->getOperand(1) == Load2->getOperand(1) &&
+      Load1->getOperand(2) == Load2->getOperand(2)) {
+    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+      return false;
+
+    // Now let's examine the displacements.
+    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+        isa<ConstantSDNode>(Load2->getOperand(3))) {
+      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                           int64_t Offset1, int64_t Offset2,
+                                           unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  if (Opc1 != Opc2)
+    return false;  // FIXME: overly conservative?
+
+  switch (Opc1) {
+  default: break;
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return false;
+  }
+
+  EVT VT = Load1->getValueType(0);
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+    // have 16 of them to play with.
+    if (TM.getSubtargetImpl()->is64Bit()) {
+      if (NumLoads >= 3)
+        return false;
+    } else if (NumLoads) {
+      return false;
+    }
+    break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+  case MVT::f32:
+  case MVT::f64:
+    if (NumLoads)
+      return false;
+    break;
+  }
+
+  return true;
+}
+
+
  bool X86InstrInfo::
  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
    assert(Cond.size() == 1 && "Invalid X86 branch condition!");
@@ -2824,22 +3131,11 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
             RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
  }
  
-unsigned X86InstrInfo::sizeOfImm(const TargetInstrDesc *Desc) {
-  switch (Desc->TSFlags & X86II::ImmMask) {
-  case X86II::Imm8:   return 1;
-  case X86II::Imm16:  return 2;
-  case X86II::Imm32:  return 4;
-  case X86II::Imm64:  return 8;
-  default: llvm_unreachable("Immediate size not set!");
-    return 0;
-  }
-}
  
-/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended register?
-/// e.g. r8, xmm8, etc.
-bool X86InstrInfo::isX86_64ExtendedReg(const MachineOperand &MO) {
-  if (!MO.isReg()) return false;
-  switch (MO.getReg()) {
+/// isX86_64ExtendedReg - Is the given register an x86-64 extended (r8 or
+/// higher) register?  e.g. r8, xmm8, xmm13, etc.
+bool X86InstrInfo::isX86_64ExtendedReg(unsigned RegNo) {
+  switch (RegNo) {
    default: break;
    case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
    case X86::R12:   case X86::R13:   case X86::R14:   case X86::R15:
@@ -2922,7 +3218,7 @@ unsigned X86InstrInfo::determineREX(const MachineInstr &MI) {
      case X86II::MRM4m: case X86II::MRM5m:
      case X86II::MRM6m: case X86II::MRM7m:
      case X86II::MRMDestMem: {
-      unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
+      unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
        i = isTwoAddr ? 1 : 0;
        if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
          REX |= 1 << 2;
@@ -3193,24 +3489,25 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
      switch (Opcode) {
      default: 
        break;
-    case TargetInstrInfo::INLINEASM: {
+    case TargetOpcode::INLINEASM: {
        const MachineFunction *MF = MI.getParent()->getParent();
        const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
        FinalSize += TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
                                            *MF->getTarget().getMCAsmInfo());
        break;
      }
-    case TargetInstrInfo::DBG_LABEL:
-    case TargetInstrInfo::EH_LABEL:
+    case TargetOpcode::DBG_LABEL:
+    case TargetOpcode::EH_LABEL:
+    case TargetOpcode::DBG_VALUE:
        break;
-    case TargetInstrInfo::IMPLICIT_DEF:
-    case TargetInstrInfo::KILL:
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
      case X86::FP_REG_KILL:
        break;
      case X86::MOVPC32r: {
        // This emits the "call" portion of this pseudo instruction.
        ++FinalSize;
-      FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
        break;
      }
      }
@@ -3228,7 +3525,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
        } else if (MO.isSymbol()) {
          FinalSize += sizeExternalSymbolAddress(false);
        } else if (MO.isImm()) {
-        FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+        FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
        } else {
          llvm_unreachable("Unknown RawFrm operand!");
        }
@@ -3241,7 +3538,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
      
      if (CurOp != NumOps) {
        const MachineOperand &MO1 = MI.getOperand(CurOp++);
-      unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+      unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
        if (MO1.isImm())
          FinalSize += sizeConstant(Size);
        else {
@@ -3266,17 +3563,17 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
      CurOp += 2;
      if (CurOp != NumOps) {
        ++CurOp;
-      FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
      }
      break;
    }
    case X86II::MRMDestMem: {
      ++FinalSize;
      FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
-    CurOp +=  X86AddrNumOperands + 1;
+    CurOp +=  X86::AddrNumOperands + 1;
      if (CurOp != NumOps) {
        ++CurOp;
-      FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
      }
      break;
    }
@@ -3287,24 +3584,17 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
      CurOp += 2;
      if (CurOp != NumOps) {
        ++CurOp;
-      FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
      }
      break;
  
    case X86II::MRMSrcMem: {
-    int AddrOperands;
-    if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
-        Opcode == X86::LEA16r || Opcode == X86::LEA32r)
-      AddrOperands = X86AddrNumOperands - 1; // No segment register
-    else
-      AddrOperands = X86AddrNumOperands;
-
      ++FinalSize;
      FinalSize += getMemModRMByteSize(MI, CurOp+1, IsPIC, Is64BitMode);
-    CurOp += AddrOperands + 1;
+    CurOp += X86::AddrNumOperands + 1;
      if (CurOp != NumOps) {
        ++CurOp;
-      FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
      }
      break;
    }
@@ -3329,7 +3619,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
  
      if (CurOp != NumOps) {
        const MachineOperand &MO1 = MI.getOperand(CurOp++);
-      unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+      unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
        if (MO1.isImm())
          FinalSize += sizeConstant(Size);
        else {
@@ -3355,11 +3645,11 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
      
      ++FinalSize;
      FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
-    CurOp += X86AddrNumOperands;
+    CurOp += X86::AddrNumOperands;
  
      if (CurOp != NumOps) {
        const MachineOperand &MO = MI.getOperand(CurOp++);
-      unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+      unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
        if (MO.isImm())
          FinalSize += sizeConstant(Size);
        else {
@@ -3377,6 +3667,14 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
        }
      }
      break;
+    
+  case X86II::MRM_C1:
+  case X86II::MRM_C8:
+  case X86II::MRM_C9:
+  case X86II::MRM_E8:
+  case X86II::MRM_F0:
+    FinalSize += 2;
+    break;
    }
  
    case X86II::MRMInitReg:
@@ -3391,7 +3689,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
      std::string msg;
      raw_string_ostream Msg(msg);
      Msg << "Cannot determine size: " << MI;
-    llvm_report_error(Msg.str());
+    report_fatal_error(Msg.str());
    }
    
  
@@ -3425,8 +3723,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
    // Insert the set of GlobalBaseReg into the first MBB of the function
    MachineBasicBlock &FirstMBB = MF->front();
    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
-  DebugLoc DL = DebugLoc::getUnknownLoc();
-  if (MBBI != FirstMBB.end()) DL = MBBI->getDebugLoc();
+  DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
    MachineRegisterInfo &RegInfo = MF->getRegInfo();
    unsigned PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
    
@@ -3450,3 +3747,56 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
    X86FI->setGlobalBaseReg(GlobalBaseReg);
    return GlobalBaseReg;
  }
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+// Each row lists one operation in all three domains; lookup() reads column
+// domain-1, so the column order below must match the domain numbering.
+static const unsigned ReplaceableInstrs[][3] = {
+  //PackedSingle     PackedDouble    PackedInt
+  { X86::MOVAPSmr,   X86::MOVAPDmr,  X86::MOVDQAmr  },
+  { X86::MOVAPSrm,   X86::MOVAPDrm,  X86::MOVDQArm  },
+  { X86::MOVAPSrr,   X86::MOVAPDrr,  X86::MOVDQArr  },
+  { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
+  { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
+  { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
+  { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
+  { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
+  { X86::ANDPSrm,    X86::ANDPDrm,   X86::PANDrm    },
+  { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
+  { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
+  { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
+  { X86::V_SET0PS,   X86::V_SET0PD,  X86::V_SET0PI  },
+  { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
+  { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+/// lookup - Find the ReplaceableInstrs row whose entry for the given domain is
+/// opcode.  domain is 1-based: the column searched is domain-1.  Returns a
+/// pointer to the matching row (three opcodes, one per domain), or null when
+/// no row matches or when domain is outside the range 1-3.
+static const unsigned *lookup(unsigned opcode, unsigned domain) {
+  // Reject out-of-range domains up front.  In particular domain 0 would make
+  // domain-1 wrap around and index far outside the three-entry row.
+  if (domain < 1 || domain > 3)
+    return 0;
+  const unsigned column = domain - 1;
+  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
+    if (ReplaceableInstrs[i][column] == opcode)
+      return ReplaceableInstrs[i];
+  return 0;
+}
+
+/// GetSSEDomain - Return a pair describing MI's SSE execution domain.  The
+/// first element is the two-bit domain field read from the instruction's
+/// TSFlags.  The second is 0xe when that domain is non-zero and the opcode
+/// appears in the ReplaceableInstrs table, and 0 otherwise.
+/// NOTE(review): 0xe looks like a mask with one bit per domain 1-3; confirm
+/// against the caller.
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::GetSSEDomain(const MachineInstr *MI) const {
+  const uint16_t CurDomain =
+    (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+
+  uint16_t Available = 0;
+  if (CurDomain != 0 && lookup(MI->getOpcode(), CurDomain) != 0)
+    Available = 0xe;
+
+  return std::pair<uint16_t, uint16_t>(CurDomain, Available);
+}
+
+/// SetSSEDomain - Rewrite MI in place to the opcode that the
+/// ReplaceableInstrs table lists for execution domain Domain (1-3).  MI must
+/// carry a non-zero SSE domain in its TSFlags and its current opcode must be
+/// present in the table; both conditions are checked by assertion only.
+void X86InstrInfo::SetSSEDomain(MachineInstr *MI, unsigned Domain) const {
+  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+
+  // Current domain of the instruction, taken from its descriptor flags.
+  const uint16_t CurDomain =
+    (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  assert(CurDomain && "Not an SSE instruction");
+
+  // Locate the table row containing the current opcode, then pick the entry
+  // for the requested domain out of that row.
+  const unsigned *Row = lookup(MI->getOpcode(), CurDomain);
+  assert(Row && "Cannot change domain");
+  const unsigned NewOpcode = Row[Domain-1];
+  MI->setDesc(get(NewOpcode));
+}
+
+/// getNoopForMachoTarget - Fill in NopInst with the instruction to use for a
+/// noop by setting its opcode to X86::NOOP.  Nothing is returned; the result
+/// is delivered through the NopInst out-parameter.
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(X86::NOOP);
+}