[x86] enable machine combiner reassociations for scalar single-precision minimums

[oota-llvm.git] / lib / Target / X86 / X86InstrInfo.cpp
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index c8d5c591ba9928ee28392a908482fa91aba4c91c..bd3a3e5165da4349f27b281f6999da2edaebbc4a 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -91,7 +91,7 @@ enum {
    TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
  };
  
-struct X86OpTblEntry {
+struct X86MemoryFoldTableEntry {
    uint16_t RegOp;
    uint16_t MemOp;
    uint16_t Flags;
@@ -104,9 +104,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      : X86GenInstrInfo(
            (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
            (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
-      Subtarget(STI), RI(STI) {
+      Subtarget(STI), RI(STI.getTargetTriple()) {
  
-  static const X86OpTblEntry OpTbl2Addr[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
      { X86::ADC32ri,     X86::ADC32mi,    0 },
      { X86::ADC32ri8,    X86::ADC32mi8,   0 },
      { X86::ADC32rr,     X86::ADC32mr,    0 },
@@ -269,17 +269,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::XOR8rr,      X86::XOR8mr,     0 }
    };
  
-  for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
-    unsigned RegOp = OpTbl2Addr[i].RegOp;
-    unsigned MemOp = OpTbl2Addr[i].MemOp;
-    unsigned Flags = OpTbl2Addr[i].Flags;
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
      AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
-                  RegOp, MemOp,
+                  Entry.RegOp, Entry.MemOp,
                    // Index 0, folded load and store, no alignment requirement.
-                  Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+                  Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
    }
  
-  static const X86OpTblEntry OpTbl0[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
      { X86::BT16ri8,     X86::BT16mi8,       TB_FOLDED_LOAD },
      { X86::BT32ri8,     X86::BT32mi8,       TB_FOLDED_LOAD },
      { X86::BT64ri8,     X86::BT64mi8,       TB_FOLDED_LOAD },
@@ -335,6 +332,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::MUL8r,       X86::MUL8m,         TB_FOLDED_LOAD },
      { X86::PEXTRDrr,    X86::PEXTRDmr,      TB_FOLDED_STORE },
      { X86::PEXTRQrr,    X86::PEXTRQmr,      TB_FOLDED_STORE },
+    { X86::PUSH16r,     X86::PUSH16rmm,     TB_FOLDED_LOAD },
+    { X86::PUSH32r,     X86::PUSH32rmm,     TB_FOLDED_LOAD },
+    { X86::PUSH64r,     X86::PUSH64rmm,     TB_FOLDED_LOAD },
      { X86::SETAEr,      X86::SETAEm,        TB_FOLDED_STORE },
      { X86::SETAr,       X86::SETAm,         TB_FOLDED_STORE },
      { X86::SETBEr,      X86::SETBEm,        TB_FOLDED_STORE },
@@ -358,6 +358,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::TEST32ri,    X86::TEST32mi,      TB_FOLDED_LOAD },
      { X86::TEST64ri32,  X86::TEST64mi32,    TB_FOLDED_LOAD },
      { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
+
      // AVX 128-bit versions of foldable instructions
      { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
      { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -372,6 +373,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVUPSrr,   X86::VMOVUPSmr,     TB_FOLDED_STORE },
      { X86::VPEXTRDrr,   X86::VPEXTRDmr,     TB_FOLDED_STORE },
      { X86::VPEXTRQrr,   X86::VPEXTRQmr,     TB_FOLDED_STORE },
+
      // AVX 256-bit foldable instructions
      { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
      { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -379,6 +381,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
      { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
      { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
+
      // AVX-512 foldable instructions
      { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
      { X86::VMOVAPDZrr,      X86::VMOVAPDZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -391,6 +394,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zmr,  TB_FOLDED_STORE },
      { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zmr,  TB_FOLDED_STORE },
      { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zmr,  TB_FOLDED_STORE },
+
      // AVX-512 foldable instructions (256-bit versions)
      { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
      { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -402,6 +406,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVDQU16Z256rr,    X86::VMOVDQU16Z256mr,  TB_FOLDED_STORE },
      { X86::VMOVDQU32Z256rr,    X86::VMOVDQU32Z256mr,  TB_FOLDED_STORE },
      { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256mr,  TB_FOLDED_STORE },
+
      // AVX-512 foldable instructions (128-bit versions)
      { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
      { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -413,20 +418,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVDQU16Z128rr,    X86::VMOVDQU16Z128mr,  TB_FOLDED_STORE },
      { X86::VMOVDQU32Z128rr,    X86::VMOVDQU32Z128mr,  TB_FOLDED_STORE },
      { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128mr,  TB_FOLDED_STORE },
+
      // F16C foldable instructions
      { X86::VCVTPS2PHrr,        X86::VCVTPS2PHmr,      TB_FOLDED_STORE },
      { X86::VCVTPS2PHYrr,       X86::VCVTPS2PHYmr,     TB_FOLDED_STORE }
    };
  
-  for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
-    unsigned RegOp      = OpTbl0[i].RegOp;
-    unsigned MemOp      = OpTbl0[i].MemOp;
-    unsigned Flags      = OpTbl0[i].Flags;
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
      AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
-                  RegOp, MemOp, TB_INDEX_0 | Flags);
+                  Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
    }
  
-  static const X86OpTblEntry OpTbl1[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+    { X86::BSF16rr,         X86::BSF16rm,             0 },
+    { X86::BSF32rr,         X86::BSF32rm,             0 },
+    { X86::BSF64rr,         X86::BSF64rm,             0 },
+    { X86::BSR16rr,         X86::BSR16rm,             0 },
+    { X86::BSR32rr,         X86::BSR32rm,             0 },
+    { X86::BSR64rr,         X86::BSR64rm,             0 },
      { X86::CMP16rr,         X86::CMP16rm,             0 },
      { X86::CMP32rr,         X86::CMP32rm,             0 },
      { X86::CMP64rr,         X86::CMP64rm,             0 },
@@ -520,11 +529,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::PSHUFLWri,       X86::PSHUFLWmi,           TB_ALIGN_16 },
      { X86::PTESTrr,         X86::PTESTrm,             TB_ALIGN_16 },
      { X86::RCPPSr,          X86::RCPPSm,              TB_ALIGN_16 },
-    { X86::RCPPSr_Int,      X86::RCPPSm_Int,          TB_ALIGN_16 },
+    { X86::RCPSSr,          X86::RCPSSm,              0 },
+    { X86::RCPSSr_Int,      X86::RCPSSm_Int,          0 },
      { X86::ROUNDPDr,        X86::ROUNDPDm,            TB_ALIGN_16 },
      { X86::ROUNDPSr,        X86::ROUNDPSm,            TB_ALIGN_16 },
      { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
-    { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int,        TB_ALIGN_16 },
      { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
      { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int,        0 },
      { X86::SQRTPDr,         X86::SQRTPDm,             TB_ALIGN_16 },
@@ -540,6 +549,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
      { X86::UCOMISDrr,       X86::UCOMISDrm,           0 },
      { X86::UCOMISSrr,       X86::UCOMISSrm,           0 },
+
+    // MMX version of foldable instructions
+    { X86::MMX_CVTPD2PIirr,   X86::MMX_CVTPD2PIirm,   0 },
+    { X86::MMX_CVTPI2PDirr,   X86::MMX_CVTPI2PDirm,   0 },
+    { X86::MMX_CVTPS2PIirr,   X86::MMX_CVTPS2PIirm,   0 },
+    { X86::MMX_CVTTPD2PIirr,  X86::MMX_CVTTPD2PIirm,  0 },
+    { X86::MMX_CVTTPS2PIirr,  X86::MMX_CVTTPS2PIirm,  0 },
+    { X86::MMX_MOVD64to64rr,  X86::MMX_MOVQ64rm,      0 },
+    { X86::MMX_PABSBrr64,     X86::MMX_PABSBrm64,     0 },
+    { X86::MMX_PABSDrr64,     X86::MMX_PABSDrm64,     0 },
+    { X86::MMX_PABSWrr64,     X86::MMX_PABSWrm64,     0 },
+    { X86::MMX_PSHUFWri,      X86::MMX_PSHUFWmi,      0 },
+
+    // 3DNow! version of foldable instructions
+    { X86::PF2IDrr,         X86::PF2IDrm,             0 },
+    { X86::PF2IWrr,         X86::PF2IWrm,             0 },
+    { X86::PFRCPrr,         X86::PFRCPrm,             0 },
+    { X86::PFRSQRTrr,       X86::PFRSQRTrm,           0 },
+    { X86::PI2FDrr,         X86::PI2FDrm,             0 },
+    { X86::PI2FWrr,         X86::PI2FWrm,             0 },
+    { X86::PSWAPDrr,        X86::PSWAPDrm,            0 },
+
      // AVX 128-bit versions of foldable instructions
      { X86::Int_VCOMISDrr,   X86::Int_VCOMISDrm,       0 },
      { X86::Int_VCOMISSrr,   X86::Int_VCOMISSrm,       0 },
@@ -606,18 +637,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPSHUFLWri,      X86::VPSHUFLWmi,          0 },
      { X86::VPTESTrr,        X86::VPTESTrm,            0 },
      { X86::VRCPPSr,         X86::VRCPPSm,             0 },
-    { X86::VRCPPSr_Int,     X86::VRCPPSm_Int,         0 },
      { X86::VROUNDPDr,       X86::VROUNDPDm,           0 },
      { X86::VROUNDPSr,       X86::VROUNDPSm,           0 },
      { X86::VRSQRTPSr,       X86::VRSQRTPSm,           0 },
-    { X86::VRSQRTPSr_Int,   X86::VRSQRTPSm_Int,       0 },
      { X86::VSQRTPDr,        X86::VSQRTPDm,            0 },
      { X86::VSQRTPSr,        X86::VSQRTPSm,            0 },
      { X86::VTESTPDrr,       X86::VTESTPDrm,           0 },
      { X86::VTESTPSrr,       X86::VTESTPSrm,           0 },
      { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
      { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
-    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
  
      // AVX 256-bit foldable instructions
      { X86::VCVTDQ2PDYrr,    X86::VCVTDQ2PDYrm,        0 },
@@ -638,27 +666,94 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
      { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
      { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+    { X86::VPTESTYrr,       X86::VPTESTYrm,           0 },
      { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
-    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
      { X86::VROUNDYPDr,      X86::VROUNDYPDm,          0 },
      { X86::VROUNDYPSr,      X86::VROUNDYPSm,          0 },
      { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
-    { X86::VRSQRTPSYr_Int,  X86::VRSQRTPSYm_Int,      0 },
      { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
      { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
      { X86::VTESTPDYrr,      X86::VTESTPDYrm,          0 },
      { X86::VTESTPSYrr,      X86::VTESTPSYrm,          0 },
-    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
-    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
  
      // AVX2 foldable instructions
+
+    // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
+    // VBROADCASTS{SD}rm memory instructions were available from AVX1.
+    // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
+    // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
+    // so they don't need an equivalent limitation.
+    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
      { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
      { X86::VPABSDrr256,     X86::VPABSDrm256,         0 },
      { X86::VPABSWrr256,     X86::VPABSWrm256,         0 },
+    { X86::VPBROADCASTBrr,  X86::VPBROADCASTBrm,      0 },
+    { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm,     0 },
+    { X86::VPBROADCASTDrr,  X86::VPBROADCASTDrm,      0 },
+    { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm,     0 },
+    { X86::VPBROADCASTQrr,  X86::VPBROADCASTQrm,      0 },
+    { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm,     0 },
+    { X86::VPBROADCASTWrr,  X86::VPBROADCASTWrm,      0 },
+    { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm,     0 },
+    { X86::VPERMPDYri,      X86::VPERMPDYmi,          0 },
+    { X86::VPERMQYri,       X86::VPERMQYmi,           0 },
+    { X86::VPMOVSXBDYrr,    X86::VPMOVSXBDYrm,        0 },
+    { X86::VPMOVSXBQYrr,    X86::VPMOVSXBQYrm,        0 },
+    { X86::VPMOVSXBWYrr,    X86::VPMOVSXBWYrm,        0 },
+    { X86::VPMOVSXDQYrr,    X86::VPMOVSXDQYrm,        0 },
+    { X86::VPMOVSXWDYrr,    X86::VPMOVSXWDYrm,        0 },
+    { X86::VPMOVSXWQYrr,    X86::VPMOVSXWQYrm,        0 },
+    { X86::VPMOVZXBDYrr,    X86::VPMOVZXBDYrm,        0 },
+    { X86::VPMOVZXBQYrr,    X86::VPMOVZXBQYrm,        0 },
+    { X86::VPMOVZXBWYrr,    X86::VPMOVZXBWYrm,        0 },
+    { X86::VPMOVZXDQYrr,    X86::VPMOVZXDQYrm,        0 },
+    { X86::VPMOVZXWDYrr,    X86::VPMOVZXWDYrm,        0 },
+    { X86::VPMOVZXWQYrr,    X86::VPMOVZXWQYrm,        0 },
      { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
      { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
      { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
  
+    // XOP foldable instructions
+    { X86::VFRCZPDrr,          X86::VFRCZPDrm,        0 },
+    { X86::VFRCZPDrrY,         X86::VFRCZPDrmY,       0 },
+    { X86::VFRCZPSrr,          X86::VFRCZPSrm,        0 },
+    { X86::VFRCZPSrrY,         X86::VFRCZPSrmY,       0 },
+    { X86::VFRCZSDrr,          X86::VFRCZSDrm,        0 },
+    { X86::VFRCZSSrr,          X86::VFRCZSSrm,        0 },
+    { X86::VPHADDBDrr,         X86::VPHADDBDrm,       0 },
+    { X86::VPHADDBQrr,         X86::VPHADDBQrm,       0 },
+    { X86::VPHADDBWrr,         X86::VPHADDBWrm,       0 },
+    { X86::VPHADDDQrr,         X86::VPHADDDQrm,       0 },
+    { X86::VPHADDWDrr,         X86::VPHADDWDrm,       0 },
+    { X86::VPHADDWQrr,         X86::VPHADDWQrm,       0 },
+    { X86::VPHADDUBDrr,        X86::VPHADDUBDrm,      0 },
+    { X86::VPHADDUBQrr,        X86::VPHADDUBQrm,      0 },
+    { X86::VPHADDUBWrr,        X86::VPHADDUBWrm,      0 },
+    { X86::VPHADDUDQrr,        X86::VPHADDUDQrm,      0 },
+    { X86::VPHADDUWDrr,        X86::VPHADDUWDrm,      0 },
+    { X86::VPHADDUWQrr,        X86::VPHADDUWQrm,      0 },
+    { X86::VPHSUBBWrr,         X86::VPHSUBBWrm,       0 },
+    { X86::VPHSUBDQrr,         X86::VPHSUBDQrm,       0 },
+    { X86::VPHSUBWDrr,         X86::VPHSUBWDrm,       0 },
+    { X86::VPROTBri,           X86::VPROTBmi,         0 },
+    { X86::VPROTBrr,           X86::VPROTBmr,         0 },
+    { X86::VPROTDri,           X86::VPROTDmi,         0 },
+    { X86::VPROTDrr,           X86::VPROTDmr,         0 },
+    { X86::VPROTQri,           X86::VPROTQmi,         0 },
+    { X86::VPROTQrr,           X86::VPROTQmr,         0 },
+    { X86::VPROTWri,           X86::VPROTWmi,         0 },
+    { X86::VPROTWrr,           X86::VPROTWmr,         0 },
+    { X86::VPSHABrr,           X86::VPSHABmr,         0 },
+    { X86::VPSHADrr,           X86::VPSHADmr,         0 },
+    { X86::VPSHAQrr,           X86::VPSHAQmr,         0 },
+    { X86::VPSHAWrr,           X86::VPSHAWmr,         0 },
+    { X86::VPSHLBrr,           X86::VPSHLBmr,         0 },
+    { X86::VPSHLDrr,           X86::VPSHLDmr,         0 },
+    { X86::VPSHLQrr,           X86::VPSHLQmr,         0 },
+    { X86::VPSHLWrr,           X86::VPSHLWmr,         0 },
+
      // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
      { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
      { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
@@ -725,6 +820,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPABSQZrr,       X86::VPABSQZrm,           0 },
      { X86::VBROADCASTSSZr,  X86::VBROADCASTSSZm,      TB_NO_REVERSE },
      { X86::VBROADCASTSDZr,  X86::VBROADCASTSDZm,      TB_NO_REVERSE },
+
      // AVX-512 foldable instructions (256-bit versions)
      { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256rm,          TB_ALIGN_32 },
      { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256rm,          TB_ALIGN_32 },
@@ -738,6 +834,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVUPSZ256rr,      X86::VMOVUPSZ256rm,          0 },
      { X86::VBROADCASTSSZ256r,  X86::VBROADCASTSSZ256m,      TB_NO_REVERSE },
      { X86::VBROADCASTSDZ256r,  X86::VBROADCASTSDZ256m,      TB_NO_REVERSE },
+
      // AVX-512 foldable instructions (256-bit versions)
      { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128rm,          TB_ALIGN_16 },
      { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128rm,          TB_ALIGN_16 },
@@ -750,27 +847,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVUPDZ128rr,      X86::VMOVUPDZ128rm,          0 },
      { X86::VMOVUPSZ128rr,      X86::VMOVUPSZ128rm,          0 },
      { X86::VBROADCASTSSZ128r,  X86::VBROADCASTSSZ128m,      TB_NO_REVERSE },
+
      // F16C foldable instructions
      { X86::VCVTPH2PSrr,        X86::VCVTPH2PSrm,            0 },
      { X86::VCVTPH2PSYrr,       X86::VCVTPH2PSYrm,           0 },
+
      // AES foldable instructions
      { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
      { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
-    { X86::VAESIMCrr,             X86::VAESIMCrm,             TB_ALIGN_16 },
-    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 }
+    { X86::VAESIMCrr,             X86::VAESIMCrm,             0 },
+    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
    };
  
-  for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
-    unsigned RegOp = OpTbl1[i].RegOp;
-    unsigned MemOp = OpTbl1[i].MemOp;
-    unsigned Flags = OpTbl1[i].Flags;
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
      AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
-                  RegOp, MemOp,
+                  Entry.RegOp, Entry.MemOp,
                    // Index 1, folded load
-                  Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+                  Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
    }
  
-  static const X86OpTblEntry OpTbl2[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
      { X86::ADC32rr,         X86::ADC32rm,       0 },
      { X86::ADC64rr,         X86::ADC64rm,       0 },
      { X86::ADD16rr,         X86::ADD16rm,       0 },
@@ -852,6 +948,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::CMPPSrri,        X86::CMPPSrmi,      TB_ALIGN_16 },
      { X86::CMPSDrr,         X86::CMPSDrm,       0 },
      { X86::CMPSSrr,         X86::CMPSSrm,       0 },
+    { X86::CRC32r32r32,     X86::CRC32r32m32,   0 },
+    { X86::CRC32r64r64,     X86::CRC32r64m64,   0 },
      { X86::DIVPDrr,         X86::DIVPDrm,       TB_ALIGN_16 },
      { X86::DIVPSrr,         X86::DIVPSrm,       TB_ALIGN_16 },
      { X86::DIVSDrr,         X86::DIVSDrm,       0 },
@@ -860,14 +958,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::DIVSSrr_Int,     X86::DIVSSrm_Int,   0 },
      { X86::DPPDrri,         X86::DPPDrmi,       TB_ALIGN_16 },
      { X86::DPPSrri,         X86::DPPSrmi,       TB_ALIGN_16 },
-    { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
-    { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
-    { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
-    { X86::FsANDPSrr,       X86::FsANDPSrm,     TB_ALIGN_16 },
-    { X86::FsORPDrr,        X86::FsORPDrm,      TB_ALIGN_16 },
-    { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
-    { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
-    { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+
+    // Do not fold Fs* scalar logical op loads because there are no scalar
+    // load variants for these instructions. When folded, the load is required
+    // to be 128-bits, so the load size would not match.
+
+    { X86::FvANDNPDrr,      X86::FvANDNPDrm,    TB_ALIGN_16 },
+    { X86::FvANDNPSrr,      X86::FvANDNPSrm,    TB_ALIGN_16 },
+    { X86::FvANDPDrr,       X86::FvANDPDrm,     TB_ALIGN_16 },
+    { X86::FvANDPSrr,       X86::FvANDPSrm,     TB_ALIGN_16 },
+    { X86::FvORPDrr,        X86::FvORPDrm,      TB_ALIGN_16 },
+    { X86::FvORPSrr,        X86::FvORPSrm,      TB_ALIGN_16 },
+    { X86::FvXORPDrr,       X86::FvXORPDrm,     TB_ALIGN_16 },
+    { X86::FvXORPSrr,       X86::FvXORPSrm,     TB_ALIGN_16 },
      { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
      { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
      { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
@@ -998,6 +1101,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm,  TB_ALIGN_16 },
      { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm,   TB_ALIGN_16 },
      { X86::PXORrr,          X86::PXORrm,        TB_ALIGN_16 },
+    { X86::ROUNDSDr,        X86::ROUNDSDm,      0 },
+    { X86::ROUNDSSr,        X86::ROUNDSSm,      0 },
      { X86::SBB32rr,         X86::SBB32rm,       0 },
      { X86::SBB64rr,         X86::SBB64rm,       0 },
      { X86::SHUFPDrri,       X86::SHUFPDrmi,     TB_ALIGN_16 },
@@ -1023,6 +1128,98 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::XOR8rr,          X86::XOR8rm,        0 },
      { X86::XORPDrr,         X86::XORPDrm,       TB_ALIGN_16 },
      { X86::XORPSrr,         X86::XORPSrm,       TB_ALIGN_16 },
+
+    // MMX version of foldable instructions
+    { X86::MMX_CVTPI2PSirr,   X86::MMX_CVTPI2PSirm,   0 },
+    { X86::MMX_PACKSSDWirr,   X86::MMX_PACKSSDWirm,   0 },
+    { X86::MMX_PACKSSWBirr,   X86::MMX_PACKSSWBirm,   0 },
+    { X86::MMX_PACKUSWBirr,   X86::MMX_PACKUSWBirm,   0 },
+    { X86::MMX_PADDBirr,      X86::MMX_PADDBirm,      0 },
+    { X86::MMX_PADDDirr,      X86::MMX_PADDDirm,      0 },
+    { X86::MMX_PADDQirr,      X86::MMX_PADDQirm,      0 },
+    { X86::MMX_PADDSBirr,     X86::MMX_PADDSBirm,     0 },
+    { X86::MMX_PADDSWirr,     X86::MMX_PADDSWirm,     0 },
+    { X86::MMX_PADDUSBirr,    X86::MMX_PADDUSBirm,    0 },
+    { X86::MMX_PADDUSWirr,    X86::MMX_PADDUSWirm,    0 },
+    { X86::MMX_PADDWirr,      X86::MMX_PADDWirm,      0 },
+    { X86::MMX_PALIGNR64irr,  X86::MMX_PALIGNR64irm,  0 },
+    { X86::MMX_PANDNirr,      X86::MMX_PANDNirm,      0 },
+    { X86::MMX_PANDirr,       X86::MMX_PANDirm,       0 },
+    { X86::MMX_PAVGBirr,      X86::MMX_PAVGBirm,      0 },
+    { X86::MMX_PAVGWirr,      X86::MMX_PAVGWirm,      0 },
+    { X86::MMX_PCMPEQBirr,    X86::MMX_PCMPEQBirm,    0 },
+    { X86::MMX_PCMPEQDirr,    X86::MMX_PCMPEQDirm,    0 },
+    { X86::MMX_PCMPEQWirr,    X86::MMX_PCMPEQWirm,    0 },
+    { X86::MMX_PCMPGTBirr,    X86::MMX_PCMPGTBirm,    0 },
+    { X86::MMX_PCMPGTDirr,    X86::MMX_PCMPGTDirm,    0 },
+    { X86::MMX_PCMPGTWirr,    X86::MMX_PCMPGTWirm,    0 },
+    { X86::MMX_PHADDSWrr64,   X86::MMX_PHADDSWrm64,   0 },
+    { X86::MMX_PHADDWrr64,    X86::MMX_PHADDWrm64,    0 },
+    { X86::MMX_PHADDrr64,     X86::MMX_PHADDrm64,     0 },
+    { X86::MMX_PHSUBDrr64,    X86::MMX_PHSUBDrm64,    0 },
+    { X86::MMX_PHSUBSWrr64,   X86::MMX_PHSUBSWrm64,   0 },
+    { X86::MMX_PHSUBWrr64,    X86::MMX_PHSUBWrm64,    0 },
+    { X86::MMX_PINSRWirri,    X86::MMX_PINSRWirmi,    0 },
+    { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
+    { X86::MMX_PMADDWDirr,    X86::MMX_PMADDWDirm,    0 },
+    { X86::MMX_PMAXSWirr,     X86::MMX_PMAXSWirm,     0 },
+    { X86::MMX_PMAXUBirr,     X86::MMX_PMAXUBirm,     0 },
+    { X86::MMX_PMINSWirr,     X86::MMX_PMINSWirm,     0 },
+    { X86::MMX_PMINUBirr,     X86::MMX_PMINUBirm,     0 },
+    { X86::MMX_PMULHRSWrr64,  X86::MMX_PMULHRSWrm64,  0 },
+    { X86::MMX_PMULHUWirr,    X86::MMX_PMULHUWirm,    0 },
+    { X86::MMX_PMULHWirr,     X86::MMX_PMULHWirm,     0 },
+    { X86::MMX_PMULLWirr,     X86::MMX_PMULLWirm,     0 },
+    { X86::MMX_PMULUDQirr,    X86::MMX_PMULUDQirm,    0 },
+    { X86::MMX_PORirr,        X86::MMX_PORirm,        0 },
+    { X86::MMX_PSADBWirr,     X86::MMX_PSADBWirm,     0 },
+    { X86::MMX_PSHUFBrr64,    X86::MMX_PSHUFBrm64,    0 },
+    { X86::MMX_PSIGNBrr64,    X86::MMX_PSIGNBrm64,    0 },
+    { X86::MMX_PSIGNDrr64,    X86::MMX_PSIGNDrm64,    0 },
+    { X86::MMX_PSIGNWrr64,    X86::MMX_PSIGNWrm64,    0 },
+    { X86::MMX_PSLLDrr,       X86::MMX_PSLLDrm,       0 },
+    { X86::MMX_PSLLQrr,       X86::MMX_PSLLQrm,       0 },
+    { X86::MMX_PSLLWrr,       X86::MMX_PSLLWrm,       0 },
+    { X86::MMX_PSRADrr,       X86::MMX_PSRADrm,       0 },
+    { X86::MMX_PSRAWrr,       X86::MMX_PSRAWrm,       0 },
+    { X86::MMX_PSRLDrr,       X86::MMX_PSRLDrm,       0 },
+    { X86::MMX_PSRLQrr,       X86::MMX_PSRLQrm,       0 },
+    { X86::MMX_PSRLWrr,       X86::MMX_PSRLWrm,       0 },
+    { X86::MMX_PSUBBirr,      X86::MMX_PSUBBirm,      0 },
+    { X86::MMX_PSUBDirr,      X86::MMX_PSUBDirm,      0 },
+    { X86::MMX_PSUBQirr,      X86::MMX_PSUBQirm,      0 },
+    { X86::MMX_PSUBSBirr,     X86::MMX_PSUBSBirm,     0 },
+    { X86::MMX_PSUBSWirr,     X86::MMX_PSUBSWirm,     0 },
+    { X86::MMX_PSUBUSBirr,    X86::MMX_PSUBUSBirm,    0 },
+    { X86::MMX_PSUBUSWirr,    X86::MMX_PSUBUSWirm,    0 },
+    { X86::MMX_PSUBWirr,      X86::MMX_PSUBWirm,      0 },
+    { X86::MMX_PUNPCKHBWirr,  X86::MMX_PUNPCKHBWirm,  0 },
+    { X86::MMX_PUNPCKHDQirr,  X86::MMX_PUNPCKHDQirm,  0 },
+    { X86::MMX_PUNPCKHWDirr,  X86::MMX_PUNPCKHWDirm,  0 },
+    { X86::MMX_PUNPCKLBWirr,  X86::MMX_PUNPCKLBWirm,  0 },
+    { X86::MMX_PUNPCKLDQirr,  X86::MMX_PUNPCKLDQirm,  0 },
+    { X86::MMX_PUNPCKLWDirr,  X86::MMX_PUNPCKLWDirm,  0 },
+    { X86::MMX_PXORirr,       X86::MMX_PXORirm,       0 },
+
+    // 3DNow! version of foldable instructions
+    { X86::PAVGUSBrr,         X86::PAVGUSBrm,         0 },
+    { X86::PFACCrr,           X86::PFACCrm,           0 },
+    { X86::PFADDrr,           X86::PFADDrm,           0 },
+    { X86::PFCMPEQrr,         X86::PFCMPEQrm,         0 },
+    { X86::PFCMPGErr,         X86::PFCMPGErm,         0 },
+    { X86::PFCMPGTrr,         X86::PFCMPGTrm,         0 },
+    { X86::PFMAXrr,           X86::PFMAXrm,           0 },
+    { X86::PFMINrr,           X86::PFMINrm,           0 },
+    { X86::PFMULrr,           X86::PFMULrm,           0 },
+    { X86::PFNACCrr,          X86::PFNACCrm,          0 },
+    { X86::PFPNACCrr,         X86::PFPNACCrm,         0 },
+    { X86::PFRCPIT1rr,        X86::PFRCPIT1rm,        0 },
+    { X86::PFRCPIT2rr,        X86::PFRCPIT2rm,        0 },
+    { X86::PFRSQIT1rr,        X86::PFRSQIT1rm,        0 },
+    { X86::PFSUBrr,           X86::PFSUBrm,           0 },
+    { X86::PFSUBRrr,          X86::PFSUBRrm,          0 },
+    { X86::PMULHRWrr,         X86::PMULHRWrm,         0 },
+
      // AVX 128-bit versions of foldable instructions
      { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
      { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    0 },
@@ -1037,9 +1234,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
      { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    0 },
      { X86::VRCPSSr,           X86::VRCPSSm,            0 },
+    { X86::VRCPSSr_Int,       X86::VRCPSSm_Int,        0 },
      { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
+    { X86::VRSQRTSSr_Int,     X86::VRSQRTSSm_Int,      0 },
      { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
+    { X86::VSQRTSDr_Int,      X86::VSQRTSDm_Int,       0 },
      { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
+    { X86::VSQRTSSr_Int,      X86::VSQRTSSm_Int,       0 },
      { X86::VADDPDrr,          X86::VADDPDrm,           0 },
      { X86::VADDPSrr,          X86::VADDPSrm,           0 },
      { X86::VADDSDrr,          X86::VADDSDrm,           0 },
@@ -1068,14 +1269,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VDIVSSrr_Int,      X86::VDIVSSrm_Int,       0 },
      { X86::VDPPDrri,          X86::VDPPDrmi,           0 },
      { X86::VDPPSrri,          X86::VDPPSrmi,           0 },
-    { X86::VFsANDNPDrr,       X86::VFsANDNPDrm,        TB_ALIGN_16 },
-    { X86::VFsANDNPSrr,       X86::VFsANDNPSrm,        TB_ALIGN_16 },
-    { X86::VFsANDPDrr,        X86::VFsANDPDrm,         TB_ALIGN_16 },
-    { X86::VFsANDPSrr,        X86::VFsANDPSrm,         TB_ALIGN_16 },
-    { X86::VFsORPDrr,         X86::VFsORPDrm,          TB_ALIGN_16 },
-    { X86::VFsORPSrr,         X86::VFsORPSrm,          TB_ALIGN_16 },
-    { X86::VFsXORPDrr,        X86::VFsXORPDrm,         TB_ALIGN_16 },
-    { X86::VFsXORPSrr,        X86::VFsXORPSrm,         TB_ALIGN_16 },
+    // Do not fold VFs* loads because there are no scalar load variants for
+    // these instructions. When folded, the load is required to be 128-bits, so
+    // the load size would not match.
+    { X86::VFvANDNPDrr,       X86::VFvANDNPDrm,        0 },
+    { X86::VFvANDNPSrr,       X86::VFvANDNPSrm,        0 },
+    { X86::VFvANDPDrr,        X86::VFvANDPDrm,         0 },
+    { X86::VFvANDPSrr,        X86::VFvANDPSrm,         0 },
+    { X86::VFvORPDrr,         X86::VFvORPDrm,          0 },
+    { X86::VFvORPSrr,         X86::VFvORPSrm,          0 },
+    { X86::VFvXORPDrr,        X86::VFvXORPDrm,         0 },
+    { X86::VFvXORPSrr,        X86::VFvXORPSrm,         0 },
      { X86::VHADDPDrr,         X86::VHADDPDrm,          0 },
      { X86::VHADDPSrr,         X86::VHADDPSrm,          0 },
      { X86::VHSUBPDrr,         X86::VHSUBPDrm,          0 },
@@ -1195,6 +1399,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPUNPCKLQDQrr,     X86::VPUNPCKLQDQrm,      0 },
      { X86::VPUNPCKLWDrr,      X86::VPUNPCKLWDrm,       0 },
      { X86::VPXORrr,           X86::VPXORrm,            0 },
+    { X86::VROUNDSDr,         X86::VROUNDSDm,          0 },
+    { X86::VROUNDSSr,         X86::VROUNDSSm,          0 },
      { X86::VSHUFPDrri,        X86::VSHUFPDrmi,         0 },
      { X86::VSHUFPSrri,        X86::VSHUFPSrmi,         0 },
      { X86::VSUBPDrr,          X86::VSUBPDrm,           0 },
@@ -1209,6 +1415,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VUNPCKLPSrr,       X86::VUNPCKLPSrm,        0 },
      { X86::VXORPDrr,          X86::VXORPDrm,           0 },
      { X86::VXORPSrr,          X86::VXORPSrm,           0 },
+
      // AVX 256-bit foldable instructions
      { X86::VADDPDYrr,         X86::VADDPDYrm,          0 },
      { X86::VADDPSYrr,         X86::VADDPSYrm,          0 },
@@ -1253,6 +1460,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VUNPCKLPSYrr,      X86::VUNPCKLPSYrm,       0 },
      { X86::VXORPDYrr,         X86::VXORPDYrm,          0 },
      { X86::VXORPSYrr,         X86::VXORPSYrm,          0 },
+
      // AVX2 foldable instructions
      { X86::VINSERTI128rr,     X86::VINSERTI128rm,      0 },
      { X86::VPACKSSDWYrr,      X86::VPACKSSDWYrm,       0 },
@@ -1274,6 +1482,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPAVGWYrr,         X86::VPAVGWYrm,          0 },
      { X86::VPBLENDDrri,       X86::VPBLENDDrmi,        0 },
      { X86::VPBLENDDYrri,      X86::VPBLENDDYrmi,       0 },
+    { X86::VPBLENDVBYrr,      X86::VPBLENDVBYrm,       0 },
      { X86::VPBLENDWYrri,      X86::VPBLENDWYrmi,       0 },
      { X86::VPCMPEQBYrr,       X86::VPCMPEQBYrm,        0 },
      { X86::VPCMPEQDYrr,       X86::VPCMPEQDYrm,        0 },
@@ -1285,9 +1494,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPCMPGTWYrr,       X86::VPCMPGTWYrm,        0 },
      { X86::VPERM2I128rr,      X86::VPERM2I128rm,       0 },
      { X86::VPERMDYrr,         X86::VPERMDYrm,          0 },
-    { X86::VPERMPDYri,        X86::VPERMPDYmi,         0 },
      { X86::VPERMPSYrr,        X86::VPERMPSYrm,         0 },
-    { X86::VPERMQYri,         X86::VPERMQYmi,          0 },
      { X86::VPHADDDYrr,        X86::VPHADDDYrm,         0 },
      { X86::VPHADDSWrr256,     X86::VPHADDSWrm256,      0 },
      { X86::VPHADDWYrr,        X86::VPHADDWYrm,         0 },
@@ -1342,8 +1549,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPSRLVQYrr,        X86::VPSRLVQYrm,         0 },
      { X86::VPSUBBYrr,         X86::VPSUBBYrm,          0 },
      { X86::VPSUBDYrr,         X86::VPSUBDYrm,          0 },
+    { X86::VPSUBQYrr,         X86::VPSUBQYrm,          0 },
      { X86::VPSUBSBYrr,        X86::VPSUBSBYrm,         0 },
      { X86::VPSUBSWYrr,        X86::VPSUBSWYrm,         0 },
+    { X86::VPSUBUSBYrr,       X86::VPSUBUSBYrm,        0 },
+    { X86::VPSUBUSWYrr,       X86::VPSUBUSWYrm,        0 },
      { X86::VPSUBWYrr,         X86::VPSUBWYrm,          0 },
      { X86::VPUNPCKHBWYrr,     X86::VPUNPCKHBWYrm,      0 },
      { X86::VPUNPCKHDQYrr,     X86::VPUNPCKHDQYrm,      0 },
@@ -1354,41 +1564,81 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPUNPCKLQDQYrr,    X86::VPUNPCKLQDQYrm,     0 },
      { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      0 },
      { X86::VPXORYrr,          X86::VPXORYrm,           0 },
-    // FIXME: add AVX 256-bit foldable instructions
  
      // FMA4 foldable patterns
-    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        0           },
-    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        0           },
-    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_16 },
-    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_16 },
-    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_32 },
-    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_32 },
-    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       0           },
-    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       0           },
-    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_16 },
-    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_16 },
-    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_32 },
-    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_32 },
-    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        0           },
-    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        0           },
-    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_16 },
-    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_16 },
-    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_32 },
-    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_32 },
-    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       0           },
-    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       0           },
-    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_16 },
-    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_16 },
-    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_32 },
-    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_32 },
-    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_16 },
-    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_16 },
-    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_32 },
-    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_32 },
-    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_16 },
-    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_16 },
-    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_32 },
-    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_32 },
+    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_NONE },
+    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_NONE },
+    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_NONE },
+    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_NONE },
+    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_NONE },
+    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_NONE },
+    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_NONE },
+
+    // XOP foldable instructions
+    { X86::VPCMOVrr,          X86::VPCMOVmr,            0 },
+    { X86::VPCMOVrrY,         X86::VPCMOVmrY,           0 },
+    { X86::VPCOMBri,          X86::VPCOMBmi,            0 },
+    { X86::VPCOMDri,          X86::VPCOMDmi,            0 },
+    { X86::VPCOMQri,          X86::VPCOMQmi,            0 },
+    { X86::VPCOMWri,          X86::VPCOMWmi,            0 },
+    { X86::VPCOMUBri,         X86::VPCOMUBmi,           0 },
+    { X86::VPCOMUDri,         X86::VPCOMUDmi,           0 },
+    { X86::VPCOMUQri,         X86::VPCOMUQmi,           0 },
+    { X86::VPCOMUWri,         X86::VPCOMUWmi,           0 },
+    { X86::VPERMIL2PDrr,      X86::VPERMIL2PDmr,        0 },
+    { X86::VPERMIL2PDrrY,     X86::VPERMIL2PDmrY,       0 },
+    { X86::VPERMIL2PSrr,      X86::VPERMIL2PSmr,        0 },
+    { X86::VPERMIL2PSrrY,     X86::VPERMIL2PSmrY,       0 },
+    { X86::VPMACSDDrr,        X86::VPMACSDDrm,          0 },
+    { X86::VPMACSDQHrr,       X86::VPMACSDQHrm,         0 },
+    { X86::VPMACSDQLrr,       X86::VPMACSDQLrm,         0 },
+    { X86::VPMACSSDDrr,       X86::VPMACSSDDrm,         0 },
+    { X86::VPMACSSDQHrr,      X86::VPMACSSDQHrm,        0 },
+    { X86::VPMACSSDQLrr,      X86::VPMACSSDQLrm,        0 },
+    { X86::VPMACSSWDrr,       X86::VPMACSSWDrm,         0 },
+    { X86::VPMACSSWWrr,       X86::VPMACSSWWrm,         0 },
+    { X86::VPMACSWDrr,        X86::VPMACSWDrm,          0 },
+    { X86::VPMACSWWrr,        X86::VPMACSWWrm,          0 },
+    { X86::VPMADCSSWDrr,      X86::VPMADCSSWDrm,        0 },
+    { X86::VPMADCSWDrr,       X86::VPMADCSWDrm,         0 },
+    { X86::VPPERMrr,          X86::VPPERMmr,            0 },
+    { X86::VPROTBrr,          X86::VPROTBrm,            0 },
+    { X86::VPROTDrr,          X86::VPROTDrm,            0 },
+    { X86::VPROTQrr,          X86::VPROTQrm,            0 },
+    { X86::VPROTWrr,          X86::VPROTWrm,            0 },
+    { X86::VPSHABrr,          X86::VPSHABrm,            0 },
+    { X86::VPSHADrr,          X86::VPSHADrm,            0 },
+    { X86::VPSHAQrr,          X86::VPSHAQrm,            0 },
+    { X86::VPSHAWrr,          X86::VPSHAWrm,            0 },
+    { X86::VPSHLBrr,          X86::VPSHLBrm,            0 },
+    { X86::VPSHLDrr,          X86::VPSHLDrm,            0 },
+    { X86::VPSHLQrr,          X86::VPSHLQrm,            0 },
+    { X86::VPSHLWrr,          X86::VPSHLWrm,            0 },
  
      // BMI/BMI2 foldable instructions
      { X86::ANDN32rr,          X86::ANDN32rm,            0 },
@@ -1435,8 +1685,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
      { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
      { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
-    { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
-    { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
+    { X86::VALIGNQZrri,       X86::VALIGNQZrmi,         0 },
+    { X86::VALIGNDZrri,       X86::VALIGNDZrmi,         0 },
      { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
      { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
      { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
@@ -1457,10 +1707,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::AESDECrr,          X86::AESDECrm,            TB_ALIGN_16 },
      { X86::AESENCLASTrr,      X86::AESENCLASTrm,        TB_ALIGN_16 },
      { X86::AESENCrr,          X86::AESENCrm,            TB_ALIGN_16 },
-    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       TB_ALIGN_16 },
-    { X86::VAESDECrr,         X86::VAESDECrm,           TB_ALIGN_16 },
-    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       TB_ALIGN_16 },
-    { X86::VAESENCrr,         X86::VAESENCrm,           TB_ALIGN_16 },
+    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       0 },
+    { X86::VAESDECrr,         X86::VAESDECrm,           0 },
+    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       0 },
+    { X86::VAESENCrr,         X86::VAESENCrm,           0 },
  
      // SHA foldable instructions
      { X86::SHA1MSG1rr,        X86::SHA1MSG1rm,          TB_ALIGN_16 },
@@ -1469,20 +1719,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::SHA1RNDS4rri,      X86::SHA1RNDS4rmi,        TB_ALIGN_16 },
      { X86::SHA256MSG1rr,      X86::SHA256MSG1rm,        TB_ALIGN_16 },
      { X86::SHA256MSG2rr,      X86::SHA256MSG2rm,        TB_ALIGN_16 },
-    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 },
+    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 }
    };
  
-  for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
-    unsigned RegOp = OpTbl2[i].RegOp;
-    unsigned MemOp = OpTbl2[i].MemOp;
-    unsigned Flags = OpTbl2[i].Flags;
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
      AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
-                  RegOp, MemOp,
+                  Entry.RegOp, Entry.MemOp,
                    // Index 2, folded load
-                  Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+                  Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
    }
  
-  static const X86OpTblEntry OpTbl3[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
      // FMA foldable instructions
      { X86::VFMADDSSr231r,         X86::VFMADDSSr231m,         TB_ALIGN_NONE },
      { X86::VFMADDSDr231r,         X86::VFMADDSDr231m,         TB_ALIGN_NONE },
@@ -1591,38 +1838,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VFMSUBADDPDr213rY,     X86::VFMSUBADDPDr213mY,     TB_ALIGN_NONE },
  
      // FMA4 foldable patterns
-    { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           0           },
-    { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           0           },
-    { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_16 },
-    { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_16 },
-    { X86::VFMADDPS4rrY,          X86::VFMADDPS4rmY,          TB_ALIGN_32 },
-    { X86::VFMADDPD4rrY,          X86::VFMADDPD4rmY,          TB_ALIGN_32 },
-    { X86::VFNMADDSS4rr,          X86::VFNMADDSS4rm,          0           },
-    { X86::VFNMADDSD4rr,          X86::VFNMADDSD4rm,          0           },
-    { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_16 },
-    { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_16 },
-    { X86::VFNMADDPS4rrY,         X86::VFNMADDPS4rmY,         TB_ALIGN_32 },
-    { X86::VFNMADDPD4rrY,         X86::VFNMADDPD4rmY,         TB_ALIGN_32 },
-    { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           0           },
-    { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           0           },
-    { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_16 },
-    { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_16 },
-    { X86::VFMSUBPS4rrY,          X86::VFMSUBPS4rmY,          TB_ALIGN_32 },
-    { X86::VFMSUBPD4rrY,          X86::VFMSUBPD4rmY,          TB_ALIGN_32 },
-    { X86::VFNMSUBSS4rr,          X86::VFNMSUBSS4rm,          0           },
-    { X86::VFNMSUBSD4rr,          X86::VFNMSUBSD4rm,          0           },
-    { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_16 },
-    { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_16 },
-    { X86::VFNMSUBPS4rrY,         X86::VFNMSUBPS4rmY,         TB_ALIGN_32 },
-    { X86::VFNMSUBPD4rrY,         X86::VFNMSUBPD4rmY,         TB_ALIGN_32 },
-    { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_16 },
-    { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_16 },
-    { X86::VFMADDSUBPS4rrY,       X86::VFMADDSUBPS4rmY,       TB_ALIGN_32 },
-    { X86::VFMADDSUBPD4rrY,       X86::VFMADDSUBPD4rmY,       TB_ALIGN_32 },
-    { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_16 },
-    { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
-    { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
-    { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
+    { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDPS4rrY,          X86::VFMADDPS4rmY,          TB_ALIGN_NONE },
+    { X86::VFMADDPD4rrY,          X86::VFMADDPD4rmY,          TB_ALIGN_NONE },
+    { X86::VFNMADDSS4rr,          X86::VFNMADDSS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDSD4rr,          X86::VFNMADDSD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDPS4rrY,         X86::VFNMADDPS4rmY,         TB_ALIGN_NONE },
+    { X86::VFNMADDPD4rrY,         X86::VFNMADDPD4rmY,         TB_ALIGN_NONE },
+    { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBPS4rrY,          X86::VFMSUBPS4rmY,          TB_ALIGN_NONE },
+    { X86::VFMSUBPD4rrY,          X86::VFMSUBPD4rmY,          TB_ALIGN_NONE },
+    { X86::VFNMSUBSS4rr,          X86::VFNMSUBSS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBSD4rr,          X86::VFNMSUBSD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBPS4rrY,         X86::VFNMSUBPS4rmY,         TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4rrY,         X86::VFNMSUBPD4rmY,         TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4rrY,       X86::VFMADDSUBPS4rmY,       TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4rrY,       X86::VFMADDSUBPD4rmY,       TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_NONE },
+
+    // XOP foldable instructions
+    { X86::VPCMOVrr,              X86::VPCMOVrm,              0 },
+    { X86::VPCMOVrrY,             X86::VPCMOVrmY,             0 },
+    { X86::VPERMIL2PDrr,          X86::VPERMIL2PDrm,          0 },
+    { X86::VPERMIL2PDrrY,         X86::VPERMIL2PDrmY,         0 },
+    { X86::VPERMIL2PSrr,          X86::VPERMIL2PSrm,          0 },
+    { X86::VPERMIL2PSrrY,         X86::VPERMIL2PSrmY,         0 },
+    { X86::VPPERMrr,              X86::VPPERMrm,              0 },
+
      // AVX-512 VPERMI instructions with 3 source operands.
      { X86::VPERMI2Drr,            X86::VPERMI2Drm,            0 },
      { X86::VPERMI2Qrr,            X86::VPERMI2Qrm,            0 },
@@ -1678,17 +1935,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMAXPDZ128rrkz,        X86::VMAXPDZ128rmkz,        0 }
    };
  
-  for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
-    unsigned RegOp = OpTbl3[i].RegOp;
-    unsigned MemOp = OpTbl3[i].MemOp;
-    unsigned Flags = OpTbl3[i].Flags;
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
      AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
-                  RegOp, MemOp,
+                  Entry.RegOp, Entry.MemOp,
                    // Index 3, folded load
-                  Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+                  Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
    }
  
-  static const X86OpTblEntry OpTbl4[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
       // AVX-512 foldable instructions
      { X86::VADDPSZrrk,         X86::VADDPSZrmk,           0 },
      { X86::VADDPDZrrk,         X86::VADDPDZrmk,           0 },
@@ -1730,14 +1984,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMAXPDZ128rrk,      X86::VMAXPDZ128rmk,        0 }
    };
  
-  for (unsigned i = 0, e = array_lengthof(OpTbl4); i != e; ++i) {
-    unsigned RegOp = OpTbl4[i].RegOp;
-    unsigned MemOp = OpTbl4[i].MemOp;
-    unsigned Flags = OpTbl4[i].Flags;
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
      AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
-                  RegOp, MemOp,
+                  Entry.RegOp, Entry.MemOp,
                    // Index 4, folded load
-                  Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+                  Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
    }
  }
  
@@ -1811,7 +2062,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
    if (MI->getOpcode() == getCallFrameSetupOpcode() ||
        MI->getOpcode() == getCallFrameDestroyOpcode()) {
      unsigned StackAlign = TFI->getStackAlignment();
-    int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * 
+    int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
                   StackAlign;
  
      SPAdj -= MI->getOperand(1).getImm();
@@ -1821,8 +2072,8 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
      else
        return -SPAdj;
    }
-  
-  // To know whether a call adjusts the stack, we need information 
+
+  // To know whether a call adjusts the stack, we need information
    // that is bound to the following ADJCALLSTACKUP pseudo.
    // Look for the next ADJCALLSTACKUP that follows the call.
    if (MI->isCall()) {
@@ -1845,7 +2096,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
    // Currently handle only PUSHes we can reasonably expect to see
    // in call sequences
    switch (MI->getOpcode()) {
-  default: 
+  default:
      return 0;
    case X86::PUSH32i8:
    case X86::PUSH32r:
@@ -1856,7 +2107,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
    }
  }
  
-/// isFrameOperand - Return true and the FrameIndex if the specified
+/// Return true and the FrameIndex if the specified
  /// operand and follow operands form a reference to the stack frame.
  bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
                                    int &FrameIndex) const {
@@ -1983,8 +2234,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
    return 0;
  }
  
-/// regIsPICBase - Return true if register is PIC base (i.e.g defined by
-/// X86::MOVPC32r.
+/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
  static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
    // Don't waste compile time scanning use-def chains of physregs.
    if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
@@ -2180,8 +2430,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
    NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
  }
  
-/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
-/// is not marked dead.
+/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
  static bool hasLiveCondCodeDef(MachineInstr *MI) {
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      MachineOperand &MO = MI->getOperand(i);
@@ -2193,8 +2442,7 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) {
    return false;
  }
  
-/// getTruncatedShiftCount - check whether the shift count for a machine operand
-/// is non-zero.
+/// Check whether the shift count for a machine operand is non-zero.
  inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
                                                unsigned ShiftAmtOperandIdx) {
    // The shift count is six bits with the REX.W prefix and five bits without.
@@ -2203,12 +2451,12 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
    return Imm & ShiftCountMask;
  }
  
-/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate
+/// Check whether the given shift count is appropriate
  /// can be represented by a LEA instruction.
  inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
    // Left shift instructions can be transformed into load-effective-address
    // instructions if we can encode them appropriately.
-  // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+  // A LEA instruction utilizes a SIB byte to encode its scale factor.
    // The SIB.scale field is two bits wide which means that we can encode any
    // shift amount less than 4.
    return ShAmt < 4 && ShAmt > 0;
@@ -2285,10 +2533,9 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
    return true;
  }
  
-/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
-/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
-/// to a 32-bit superregister and then truncating back down to a 16-bit
-/// subregister.
+/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
+/// LEA to form 3-address code by promoting to a 32-bit superregister and then
+/// truncating back down to a 16-bit subregister.
  MachineInstr *
  X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                             MachineFunction::iterator &MFI,
@@ -2395,7 +2642,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
    return ExtMI;
  }
  
-/// convertToThreeAddress - This method must be implemented by targets that
+/// This method must be implemented by targets that
  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
  /// may be able to convert a two-address instruction into a true
  /// three-address instruction on demand.  This allows the X86 target (for
@@ -2670,8 +2917,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
    return NewMI;
  }
  
-/// commuteInstruction - We have a few instructions that must be hacked on to
-/// commute them.
+/// We have a few instructions that must be hacked on to commute them.
  ///
  MachineInstr *
  X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
@@ -2778,6 +3024,32 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
        return nullptr;
      }
    }
+  case X86::VPCOMBri: case X86::VPCOMUBri:
+  case X86::VPCOMDri: case X86::VPCOMUDri:
+  case X86::VPCOMQri: case X86::VPCOMUQri:
+  case X86::VPCOMWri: case X86::VPCOMUWri: {
+    // Flip comparison mode immediate (if necessary).
+    unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+    switch (Imm) {
+    case 0x00: Imm = 0x02; break; // LT -> GT
+    case 0x01: Imm = 0x03; break; // LE -> GE
+    case 0x02: Imm = 0x00; break; // GT -> LT
+    case 0x03: Imm = 0x01; break; // GE -> LE
+    case 0x04: // EQ
+    case 0x05: // NE
+    case 0x06: // FALSE
+    case 0x07: // TRUE
+    default:
+      break;
+    }
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->getOperand(3).setImm(Imm);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
    case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
    case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
    case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
@@ -2936,7 +3208,7 @@ static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
    }
  }
  
-/// getCondFromSETOpc - return condition code of a SET opcode.
+/// Return condition code of a SET opcode.
  static X86::CondCode getCondFromSETOpc(unsigned Opc) {
    switch (Opc) {
    default: return X86::COND_INVALID;
@@ -2959,7 +3231,7 @@ static X86::CondCode getCondFromSETOpc(unsigned Opc) {
    }
  }
  
-/// getCondFromCmovOpc - return condition code of a CMov opcode.
+/// Return condition code of a CMov opcode.
  X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
    switch (Opc) {
    default: return X86::COND_INVALID;
@@ -3036,7 +3308,7 @@ unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
    }
  }
  
-/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// Return the inverse of the specified condition,
  /// e.g. turning COND_E to COND_NE.
  X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
    switch (CC) {
@@ -3060,9 +3332,8 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
    }
  }
  
-/// getSwappedCondition - assume the flags are set by MI(a,b), return
-/// the condition code if we modify the instructions such that flags are
-/// set by MI(b,a).
+/// Assuming the flags are set by MI(a,b), return the condition code if we
+/// modify the instructions such that flags are set by MI(b,a).
  static X86::CondCode getSwappedCondition(X86::CondCode CC) {
    switch (CC) {
    default: return X86::COND_INVALID;
@@ -3079,7 +3350,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
    }
  }
  
-/// getSETFromCond - Return a set opcode for the given condition and
+/// Return a set opcode for the given condition and
  /// whether it has memory operand.
  unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
    static const uint16_t Opc[16][2] = {
@@ -3105,7 +3376,7 @@ unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
    return Opc[CC][HasMemoryOperand ? 1 : 0];
  }
  
-/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// Return a cmov opcode for the given condition,
  /// register size in bytes, and operand type.
  unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
                                bool HasMemoryOperand) {
@@ -3165,11 +3436,11 @@ bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
    return !isPredicated(MI);
  }
  
-bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
-                                 MachineBasicBlock *&TBB,
-                                 MachineBasicBlock *&FBB,
-                                 SmallVectorImpl<MachineOperand> &Cond,
-                                 bool AllowModify) const {
+bool X86InstrInfo::AnalyzeBranchImpl(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    SmallVectorImpl<MachineOperand> &Cond,
+    SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
+
    // Start from the bottom of the block and work up, examining the
    // terminator instructions.
    MachineBasicBlock::iterator I = MBB.end();
@@ -3267,6 +3538,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
        FBB = TBB;
        TBB = I->getOperand(0).getMBB();
        Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      CondBranches.push_back(I);
        continue;
      }
  
@@ -3304,11 +3576,90 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
  
      // Update the MachineOperand.
      Cond[0].setImm(BranchCode);
+    CondBranches.push_back(I);
    }
  
    return false;
  }
  
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  SmallVector<MachineInstr *, 4> CondBranches;
+  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
+}
+
+bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB,
+                                          MachineBranchPredicate &MBP,
+                                          bool AllowModify) const {
+  using namespace std::placeholders;
+
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineInstr *, 4> CondBranches;
+  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
+                        AllowModify))
+    return true;
+
+  if (Cond.size() != 1)
+    return true;
+
+  assert(MBP.TrueDest && "expected!");
+
+  if (!MBP.FalseDest)
+    MBP.FalseDest = MBB.getNextNode();
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  MachineInstr *ConditionDef = nullptr;
+  bool SingleUseCondition = true;
+
+  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
+    if (I->modifiesRegister(X86::EFLAGS, TRI)) {
+      ConditionDef = &*I;
+      break;
+    }
+
+    if (I->readsRegister(X86::EFLAGS, TRI))
+      SingleUseCondition = false;
+  }
+
+  if (!ConditionDef)
+    return true;
+
+  if (SingleUseCondition) {
+    for (auto *Succ : MBB.successors())
+      if (Succ->isLiveIn(X86::EFLAGS))
+        SingleUseCondition = false;
+  }
+
+  MBP.ConditionDef = ConditionDef;
+  MBP.SingleUseCondition = SingleUseCondition;
+
+  // Currently we only recognize the simple pattern:
+  //
+  //   test %reg, %reg
+  //   je %label
+  //
+  const unsigned TestOpcode =
+      Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
+
+  if (ConditionDef->getOpcode() == TestOpcode &&
+      ConditionDef->getNumOperands() == 3 &&
+      ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
+      (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
+    MBP.LHS = ConditionDef->getOperand(0);
+    MBP.RHS = MachineOperand::CreateImm(0);
+    MBP.Predicate = Cond[0].getImm() == X86::COND_NE
+                        ? MachineBranchPredicate::PRED_NE
+                        : MachineBranchPredicate::PRED_EQ;
+    return false;
+  }
+
+  return true;
+}
+
  unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
    MachineBasicBlock::iterator I = MBB.end();
    unsigned Count = 0;
@@ -3331,8 +3682,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  
  unsigned
  X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                           MachineBasicBlock *FBB,
-                           const SmallVectorImpl<MachineOperand> &Cond,
+                           MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                             DebugLoc DL) const {
    // Shouldn't be a fall through.
    assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -3380,7 +3730,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
  
  bool X86InstrInfo::
  canInsertSelect(const MachineBasicBlock &MBB,
-                const SmallVectorImpl<MachineOperand> &Cond,
+                ArrayRef<MachineOperand> Cond,
                  unsigned TrueReg, unsigned FalseReg,
                  int &CondCycles, int &TrueCycles, int &FalseCycles) const {
    // Not all subtargets have cmov instructions.
@@ -3417,8 +3767,7 @@ canInsertSelect(const MachineBasicBlock &MBB,
  
  void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I, DebugLoc DL,
-                                unsigned DstReg,
-                                const SmallVectorImpl<MachineOperand> &Cond,
+                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
                                  unsigned TrueReg, unsigned FalseReg) const {
     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     assert(Cond.size() == 1 && "Invalid Cond array");
@@ -3428,7 +3777,7 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
  }
  
-/// isHReg - Test if the given register is a physical h register.
+/// Test if the given register is a physical h register.
  static bool isHReg(unsigned Reg) {
    return X86::GR8_ABCD_HRegClass.contains(Reg);
  }
@@ -3451,7 +3800,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
                                                 X86::MOVPQIto64rr);
      if (X86::VR64RegClass.contains(SrcReg))
        // Copy from a VR64 register to a GR64 register.
-      return X86::MOVSDto64rr;
+      return X86::MMX_MOVD64from64rr;
    } else if (X86::GR64RegClass.contains(SrcReg)) {
      // Copy from a GR64 register to a VR128 register.
      if (X86::VR128XRegClass.contains(DestReg))
@@ -3459,7 +3808,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
                                                 X86::MOV64toPQIrr);
      // Copy from a GR64 register to a VR64 register.
      if (X86::VR64RegClass.contains(DestReg))
-      return X86::MOV64toSDrr;
+      return X86::MMX_MOVD64to64rr;
    }
  
    // SrcReg(FR32) -> DestReg(GR32)
@@ -3554,34 +3903,59 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
      return;
    }
  
-  // Moving EFLAGS to / from another register requires a push and a pop.
-  // Notice that we have to adjust the stack if we don't want to clobber the
-  // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
-  if (SrcReg == X86::EFLAGS) {
-    if (X86::GR64RegClass.contains(DestReg)) {
-      BuildMI(MBB, MI, DL, get(X86::PUSHF64));
-      BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
-      return;
-    }
-    if (X86::GR32RegClass.contains(DestReg)) {
-      BuildMI(MBB, MI, DL, get(X86::PUSHF32));
-      BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
-      return;
+  bool FromEFLAGS = SrcReg == X86::EFLAGS;
+  bool ToEFLAGS = DestReg == X86::EFLAGS;
+  int Reg = FromEFLAGS ? DestReg : SrcReg;
+  bool is32 = X86::GR32RegClass.contains(Reg);
+  bool is64 = X86::GR64RegClass.contains(Reg);
+  if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+    // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+    // inefficient. Instead:
+    //   - Save the overflow flag OF into AL using SETO, and restore it using a
+    //     signed 8-bit addition of AL and INT8_MAX.
+    //   - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+    //     using LAHF/SAHF.
+    //   - When RAX/EAX is live and isn't the destination register, make sure it
+    //     isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+    //     the flags.
+    // This approach is ~2.25x faster than using PUSHF/POPF.
+    //
+    // This is still somewhat inefficient because we don't know which flags are
+    // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+    // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+    //
+    // PUSHF/POPF is also potentially incorrect because it affects other flags
+    // such as TF/IF/DF, which LLVM doesn't model.
+    //
+    // Notice that we have to adjust the stack if we don't want to clobber the
+    // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
+
+    int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+    int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+    int Pop = is64 ? X86::POP64r : X86::POP32r;
+    int AX = is64 ? X86::RAX : X86::EAX;
+
+    bool AXDead = (Reg == AX) ||
+                  (MachineBasicBlock::LQR_Dead ==
+                   MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI));
+
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+    if (FromEFLAGS) {
+      BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+      BuildMI(MBB, MI, DL, get(X86::LAHF));
+      BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
      }
-  }
-  if (DestReg == X86::EFLAGS) {
-    if (X86::GR64RegClass.contains(SrcReg)) {
-      BuildMI(MBB, MI, DL, get(X86::PUSH64r))
-        .addReg(SrcReg, getKillRegState(KillSrc));
-      BuildMI(MBB, MI, DL, get(X86::POPF64));
-      return;
-    }
-    if (X86::GR32RegClass.contains(SrcReg)) {
-      BuildMI(MBB, MI, DL, get(X86::PUSH32r))
-        .addReg(SrcReg, getKillRegState(KillSrc));
-      BuildMI(MBB, MI, DL, get(X86::POPF32));
-      return;
+    if (ToEFLAGS) {
+      BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+          .addReg(X86::AL)
+          .addImm(INT8_MAX);
+      BuildMI(MBB, MI, DL, get(X86::SAHF));
      }
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Pop), AX);
+    return;
    }
  
    DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
@@ -3676,6 +4050,36 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
    }
  }
  
+bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg,
+                                         unsigned &Offset,
+                                         const TargetRegisterInfo *TRI) const {
+  const MCInstrDesc &Desc = MemOp->getDesc();
+  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode());
+  if (MemRefBegin < 0)
+    return false;
+
+  MemRefBegin += X86II::getOperandBias(Desc);
+
+  BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg();
+  if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
+    return false;
+
+  if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
+      X86::NoRegister)
+    return false;
+
+  const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp);
+
+  // Displacement can be symbolic
+  if (!DispMO.isImm())
+    return false;
+
+  Offset = DispMO.getImm();
+
+  return (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() ==
+          X86::NoRegister);
+}
+
  static unsigned getStoreRegOpcode(unsigned SrcReg,
                                    const TargetRegisterClass *RC,
                                    bool isStackAligned,
@@ -3700,11 +4104,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
    assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
           "Stack slot too small for store");
    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
-  bool isAligned = (MF.getTarget()
-                        .getSubtargetImpl()
-                        ->getFrameLowering()
-                        ->getStackAlignment() >= Alignment) ||
-                   RI.canRealignStack(MF);
+  bool isAligned =
+      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
    unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
    DebugLoc DL = MBB.findDebugLoc(MI);
    addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
@@ -3739,11 +4141,9 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                          const TargetRegisterInfo *TRI) const {
    const MachineFunction &MF = *MBB.getParent();
    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
-  bool isAligned = (MF.getTarget()
-                        .getSubtargetImpl()
-                        ->getFrameLowering()
-                        ->getStackAlignment() >= Alignment) ||
-                   RI.canRealignStack(MF);
+  bool isAligned =
+      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
    unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
    DebugLoc DL = MBB.findDebugLoc(MI);
    addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
@@ -3839,7 +4239,7 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
    return false;
  }
  
-/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// Check whether the first instruction, whose only
  /// purpose is to update flags, can be made redundant.
  /// CMPrr can be made redundant by SUBrr if the operands are the same.
  /// This function can be extended later on.
@@ -3882,7 +4282,7 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
    return false;
  }
  
-/// isDefConvertible - check whether the definition can be converted
+/// Check whether the definition can be converted
  /// to remove a comparison against zero.
  inline static bool isDefConvertible(MachineInstr *MI) {
    switch (MI->getOpcode()) {
@@ -3968,8 +4368,7 @@ inline static bool isDefConvertible(MachineInstr *MI) {
    }
  }
  
-/// isUseDefConvertible - check whether the use can be converted
-/// to remove a comparison against zero.
+/// Check whether the use can be converted to remove a comparison against zero.
  static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
    switch (MI->getOpcode()) {
    default: return X86::COND_INVALID;
@@ -3988,7 +4387,7 @@ static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
    }
  }
  
-/// optimizeCompareInstr - Check if there exists an earlier instruction that
+/// Check if there exists an earlier instruction that
  /// operates on the same source operands and sets flags in the same way as
  /// Compare; remove Compare if possible.
  bool X86InstrInfo::
@@ -4279,7 +4678,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
    return true;
  }
  
-/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// Try to remove the load by folding it to a register
  /// operand at the use. We fold the load instructions if load defines a virtual
  /// register, the virtual register is used once in the same BB, and the
  /// instructions in-between do not load or store, and have no side effects.
@@ -4299,7 +4698,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
    DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
    assert(DefMI);
    bool SawStore = false;
-  if (!DefMI->isSafeToMove(this, nullptr, SawStore))
+  if (!DefMI->isSafeToMove(nullptr, SawStore))
      return nullptr;
  
    // Collect information about virtual register operands of MI.
@@ -4323,9 +4722,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
      return nullptr;
  
    // Check whether we can fold the def into SrcOperandId.
-  SmallVector<unsigned, 8> Ops;
-  Ops.push_back(SrcOperandId);
-  MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+  MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI);
    if (FoldMI) {
      FoldAsLoadDefReg = 0;
      return FoldMI;
@@ -4334,9 +4731,9 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
    return nullptr;
  }
  
-/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
-/// instruction with two undef reads of the register being defined.  This is
-/// used for mapping:
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.
+/// This is used for mapping:
  ///   %xmm4 = V_SET0
  /// to:
  ///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
@@ -4366,8 +4763,8 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
    const GlobalValue *GV =
        cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
    unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
-  MachineMemOperand *MMO = MBB.getParent()->
-      getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8);
+  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+      MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8);
    MachineBasicBlock::iterator I = MIB.getInstr();
  
    BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
@@ -4419,8 +4816,17 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
    return false;
  }
  
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+  unsigned NumAddrOps = MOs.size();
+  for (unsigned i = 0; i != NumAddrOps; ++i)
+    MIB.addOperand(MOs[i]);
+  if (NumAddrOps < 4) // FrameIndex only
+    addOffset(MIB, 0);
+}
+
  static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
-                                     const SmallVectorImpl<MachineOperand> &MOs,
+                                     ArrayRef<MachineOperand> MOs,
+                                     MachineBasicBlock::iterator InsertPt,
                                       MachineInstr *MI,
                                       const TargetInstrInfo &TII) {
    // Create the base instruction with the memory operand as the first part.
@@ -4428,11 +4834,7 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
    MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
                                                MI->getDebugLoc(), true);
    MachineInstrBuilder MIB(MF, NewMI);
-  unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4)  // FrameIndex only
-    addOffset(MIB, 0);
+  addOperands(MIB, MOs);
  
    // Loop over the rest of the ri operands, converting them over.
    unsigned NumOps = MI->getDesc().getNumOperands()-2;
@@ -4444,12 +4846,16 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
      MachineOperand &MO = MI->getOperand(i);
      MIB.addOperand(MO);
    }
+
+  MachineBasicBlock *MBB = InsertPt->getParent();
+  MBB->insert(InsertPt, NewMI);
+
    return MIB;
  }
  
-static MachineInstr *FuseInst(MachineFunction &MF,
-                              unsigned Opcode, unsigned OpNo,
-                              const SmallVectorImpl<MachineOperand> &MOs,
+static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
+                              unsigned OpNo, ArrayRef<MachineOperand> MOs,
+                              MachineBasicBlock::iterator InsertPt,
                                MachineInstr *MI, const TargetInstrInfo &TII) {
    // Omit the implicit operands, something BuildMI can't do.
    MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
@@ -4460,49 +4866,45 @@ static MachineInstr *FuseInst(MachineFunction &MF,
      MachineOperand &MO = MI->getOperand(i);
      if (i == OpNo) {
        assert(MO.isReg() && "Expected to fold into reg operand!");
-      unsigned NumAddrOps = MOs.size();
-      for (unsigned i = 0; i != NumAddrOps; ++i)
-        MIB.addOperand(MOs[i]);
-      if (NumAddrOps < 4)  // FrameIndex only
-        addOffset(MIB, 0);
+      addOperands(MIB, MOs);
      } else {
        MIB.addOperand(MO);
      }
    }
+
+  MachineBasicBlock *MBB = InsertPt->getParent();
+  MBB->insert(InsertPt, NewMI);
+
    return MIB;
  }
  
  static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
-                                const SmallVectorImpl<MachineOperand> &MOs,
+                                ArrayRef<MachineOperand> MOs,
+                                MachineBasicBlock::iterator InsertPt,
                                  MachineInstr *MI) {
-  MachineFunction &MF = *MI->getParent()->getParent();
-  MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
-
-  unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4)  // FrameIndex only
-    addOffset(MIB, 0);
+  MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+                                    MI->getDebugLoc(), TII.get(Opcode));
+  addOperands(MIB, MOs);
    return MIB.addImm(0);
  }
  
-MachineInstr*
-X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                    MachineInstr *MI, unsigned i,
-                                    const SmallVectorImpl<MachineOperand> &MOs,
-                                    unsigned Size, unsigned Align,
-                                    bool AllowCommute) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align, bool AllowCommute) const {
    const DenseMap<unsigned,
                   std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
    bool isCallRegIndirect = Subtarget.callRegIndirect();
    bool isTwoAddrFold = false;
  
-  // Atom favors register form of call. So, we do not fold loads into calls
-  // when X86Subtarget is Atom.
-  if (isCallRegIndirect &&
-    (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
+  // For CPUs that favor the register form of a call or push,
+  // do not fold loads into calls or pushes, unless optimizing for size
+  // aggressively.
+  if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+      (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r ||
+       MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r ||
+       MI->getOpcode() == X86::PUSH64r))
      return nullptr;
-  }
  
    unsigned NumOps = MI->getDesc().getNumOperands();
    bool isTwoAddr = NumOps > 1 &&
@@ -4518,27 +4920,27 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
    // Folding a memory location into the two-address part of a two-address
    // instruction is different than folding it other places.  It requires
    // replacing the *two* registers with the memory location.
-  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+  if (isTwoAddr && NumOps >= 2 && OpNum < 2 &&
        MI->getOperand(0).isReg() &&
        MI->getOperand(1).isReg() &&
        MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
      OpcodeTablePtr = &RegOp2MemOpTable2Addr;
      isTwoAddrFold = true;
-  } else if (i == 0) { // If operand 0
+  } else if (OpNum == 0) {
      if (MI->getOpcode() == X86::MOV32r0) {
-      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
        if (NewMI)
          return NewMI;
      }
  
      OpcodeTablePtr = &RegOp2MemOpTable0;
-  } else if (i == 1) {
+  } else if (OpNum == 1) {
      OpcodeTablePtr = &RegOp2MemOpTable1;
-  } else if (i == 2) {
+  } else if (OpNum == 2) {
      OpcodeTablePtr = &RegOp2MemOpTable2;
-  } else if (i == 3) {
+  } else if (OpNum == 3) {
      OpcodeTablePtr = &RegOp2MemOpTable3;
-  } else if (i == 4) {
+  } else if (OpNum == 4) {
      OpcodeTablePtr = &RegOp2MemOpTable4;
    }
  
@@ -4554,7 +4956,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
          return nullptr;
        bool NarrowToMOV32rm = false;
        if (Size) {
-        unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
+        unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
          if (Size < RCSize) {
            // Check if it's safe to fold the load. If the size of the object is
            // narrower than the load width, then it's not.
@@ -4571,9 +4973,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
        }
  
        if (isTwoAddrFold)
-        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
        else
-        NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+        NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
  
        if (NarrowToMOV32rm) {
          // If this is the special case where we use a MOV32rm to load a 32-bit
@@ -4592,7 +4994,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
    // If the instruction and target operand are commutable, commute the
    // instruction and try again.
    if (AllowCommute) {
-    unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
+    unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2;
      if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
        bool HasDef = MI->getDesc().getNumDefs();
        unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
@@ -4625,8 +5027,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
          // Attempt to fold with the commuted version of the instruction.
          unsigned CommuteOp =
              (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
-        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
-                                      /*AllowCommute=*/false);
+        NewMI =
+            foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align,
+                                  /*AllowCommute=*/false);
          if (NewMI)
            return NewMI;
  
@@ -4650,11 +5053,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
  
    // No fusion
    if (PrintFailedFusing && !MI->isCopy())
-    dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+    dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI;
    return nullptr;
  }
  
-/// hasPartialRegUpdate - Return true for all instructions that only update
+/// Return true for all instructions that only update
  /// the first 32 or 64-bits of the destination register and leave the rest
  /// unmodified. This can be used to avoid folding loads if the instructions
  /// only update part of the destination register, and the non-updated part is
@@ -4716,7 +5119,7 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
    return false;
  }
  
-/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
+/// Inform the ExeDepsFix pass how many idle
  /// instructions we would like before a partial register update.
  unsigned X86InstrInfo::
  getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
@@ -4855,18 +5258,15 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
    MI->addRegisterKilled(Reg, TRI, true);
  }
  
-MachineInstr*
-X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
-                                    const SmallVectorImpl<unsigned> &Ops,
-                                    int FrameIndex) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
    // Check switch flag
    if (NoFusing) return nullptr;
  
    // Unless optimizing for size, don't fold to avoid partial
    // register update stalls
-  if (!MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
-      hasPartialRegUpdate(MI->getOpcode()))
+  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
      return nullptr;
  
    const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -4875,10 +5275,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
    // If the function stack isn't realigned we don't want to fold instructions
    // that need increased alignment.
    if (!RI.needsStackRealignment(MF))
-    Alignment = std::min(Alignment, MF.getTarget()
-                                        .getSubtargetImpl()
-                                        ->getFrameLowering()
-                                        ->getStackAlignment());
+    Alignment =
+        std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
    if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
      unsigned NewOpc = 0;
      unsigned RCSize = 0;
@@ -4899,52 +5297,83 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
    } else if (Ops.size() != 1)
      return nullptr;
  
-  SmallVector<MachineOperand,4> MOs;
-  MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+  return foldMemoryOperandImpl(MF, MI, Ops[0],
+                               MachineOperand::CreateFI(FrameIndex), InsertPt,
                                 Size, Alignment, /*AllowCommute=*/true);
  }
  
-static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
-                                  const MachineFunction &MF) {
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version.  For instance, this transformation isn't legal:
+///   movss (%rdi), %xmm0
+///   addps %xmm0, %xmm0
+/// ->
+///   addps (%rdi), %xmm0
+///
+/// But this one is:
+///   movss (%rdi), %xmm0
+///   addss %xmm0, %xmm0
+/// ->
+///   addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+                                             const MachineInstr &UserMI,
+                                             const MachineFunction &MF) {
    unsigned Opc = LoadMI.getOpcode();
+  unsigned UserOpc = UserMI.getOpcode();
    unsigned RegSize =
        MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
  
-  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
      // These instructions only load 32 bits, we can't fold them if the
-    // destination register is wider than 32 bits (4 bytes).
-    return true;
+    // destination register is wider than 32 bits (4 bytes), and its user
+    // instruction isn't scalar (SS).
+    switch (UserOpc) {
+    case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+    case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+    case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+    case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
  
-  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
      // These instructions only load 64 bits, we can't fold them if the
-    // destination register is wider than 64 bits (8 bytes).
-    return true;
+    // destination register is wider than 64 bits (8 bytes), and its user
+    // instruction isn't scalar (SD).
+    switch (UserOpc) {
+    case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+    case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+    case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+    case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
  
    return false;
  }
  
-MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                  MachineInstr *MI,
-                                           const SmallVectorImpl<unsigned> &Ops,
-                                                  MachineInstr *LoadMI) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
    // If loading from a FrameIndex, fold directly from the FrameIndex.
    unsigned NumOps = LoadMI->getDesc().getNumOperands();
    int FrameIndex;
    if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
        return nullptr;
-    return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
    }
  
    // Check switch flag
    if (NoFusing) return nullptr;
  
-  // Unless optimizing for size, don't fold to avoid partial
-  // register update stalls
-  if (!MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
-      hasPartialRegUpdate(MI->getOpcode()))
+  // Avoid partial register update stalls unless optimizing for size.
+  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
      return nullptr;
  
    // Determine the alignment of the load.
@@ -5046,76 +5475,19 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      break;
    }
    default: {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
        return nullptr;
  
      // Folding a normal load. Just copy the load's address operands.
-    for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
-      MOs.push_back(LoadMI->getOperand(i));
+    MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands,
+               LoadMI->operands_begin() + NumOps);
      break;
    }
    }
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
                                 /*Size=*/0, Alignment, /*AllowCommute=*/true);
  }
  
-
-bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
-                                  const SmallVectorImpl<unsigned> &Ops) const {
-  // Check switch flag
-  if (NoFusing) return 0;
-
-  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
-    switch (MI->getOpcode()) {
-    default: return false;
-    case X86::TEST8rr:
-    case X86::TEST16rr:
-    case X86::TEST32rr:
-    case X86::TEST64rr:
-      return true;
-    case X86::ADD32ri:
-      // FIXME: AsmPrinter doesn't know how to handle
-      // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
-      if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
-        return false;
-      break;
-    }
-  }
-
-  if (Ops.size() != 1)
-    return false;
-
-  unsigned OpNum = Ops[0];
-  unsigned Opc = MI->getOpcode();
-  unsigned NumOps = MI->getDesc().getNumOperands();
-  bool isTwoAddr = NumOps > 1 &&
-    MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
-  // Folding a memory location into the two-address part of a two-address
-  // instruction is different than folding it other places.  It requires
-  // replacing the *two* registers with the memory location.
-  const DenseMap<unsigned,
-                 std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
-  if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
-    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
-  } else if (OpNum == 0) { // If operand 0
-    if (Opc == X86::MOV32r0)
-      return true;
-
-    OpcodeTablePtr = &RegOp2MemOpTable0;
-  } else if (OpNum == 1) {
-    OpcodeTablePtr = &RegOp2MemOpTable1;
-  } else if (OpNum == 2) {
-    OpcodeTablePtr = &RegOp2MemOpTable2;
-  } else if (OpNum == 3) {
-    OpcodeTablePtr = &RegOp2MemOpTable3;
-  }
-
-  if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
-    return true;
-  return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
-}
-
  bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
                                  unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
@@ -5314,7 +5686,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
    }
    if (Load)
      BeforeOps.push_back(SDValue(Load, 0));
-  std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+  BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
    SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
    NewNodes.push_back(NewNode);
  
@@ -5341,7 +5713,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
      NewNodes.push_back(Store);
  
      // Preserve memory reference information.
-    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
    }
  
    return true;
@@ -5696,7 +6068,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
             RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
  }
  
-/// getGlobalBaseReg - Return a virtual register initialized with the
+/// Return a virtual register initialized with the
  /// the global base register value. Output instructions required to
  /// initialize the register in the function entry block, if necessary.
  ///
@@ -5729,6 +6101,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
    { X86::MOVAPSrr,   X86::MOVAPDrr,  X86::MOVDQArr  },
    { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
    { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
+  { X86::MOVLPSmr,   X86::MOVLPDmr,  X86::MOVPQI2QImr  },
    { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
    { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
    { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
@@ -5744,6 +6117,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
    { X86::VMOVAPSrr,  X86::VMOVAPDrr,  X86::VMOVDQArr  },
    { X86::VMOVUPSmr,  X86::VMOVUPDmr,  X86::VMOVDQUmr  },
    { X86::VMOVUPSrm,  X86::VMOVUPDrm,  X86::VMOVDQUrm  },
+  { X86::VMOVLPSmr,  X86::VMOVLPDmr,  X86::VMOVPQI2QImr  },
    { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
    { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNrm   },
    { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNrr   },
@@ -5829,7 +6203,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
    MI->setDesc(get(table[Domain-1]));
  }
  
-/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+/// Return the noop instruction to use for a noop.
  void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
    NopInst.setOpcode(X86::NOOP);
  }
@@ -5841,7 +6215,7 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
  void X86InstrInfo::getUnconditionalBranch(
      MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
    Branch.setOpcode(X86::JMP_1);
-  Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
+  Branch.addOperand(MCOperand::createExpr(BranchTarget));
  }
  
  // This code must remain in sync with getJumpInstrTableEntryBound in this class!
@@ -5938,15 +6312,350 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
  }
  
  bool X86InstrInfo::
-hasHighOperandLatency(const InstrItineraryData *ItinData,
+hasHighOperandLatency(const TargetSchedModel &SchedModel,
                        const MachineRegisterInfo *MRI,
                        const MachineInstr *DefMI, unsigned DefIdx,
                        const MachineInstr *UseMI, unsigned UseIdx) const {
    return isHighLatencyDef(DefMI->getOpcode());
  }
  
+static bool hasReassociableOperands(const MachineInstr &Inst,
+                                    const MachineBasicBlock *MBB) {
+  assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+         "Reassociation needs binary operators");
+  const MachineOperand &Op1 = Inst.getOperand(1);
+  const MachineOperand &Op2 = Inst.getOperand(2);
+  const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  // Integer binary math/logic instructions have a third source operand:
+  // the EFLAGS register. That operand must be both defined here and never
+  // used; ie, it must be dead. If the EFLAGS operand is live, then we can
+  // not change anything because rearranging the operands could affect other
+  // instructions that depend on the exact status flags (zero, sign, etc.)
+  // that are set by using these particular operands with this operation.
+  if (Inst.getNumOperands() == 4) {
+    assert(Inst.getOperand(3).isReg() &&
+           Inst.getOperand(3).getReg() == X86::EFLAGS &&
+           "Unexpected operand in reassociable instruction");
+    if (!Inst.getOperand(3).isDead())
+      return false;
+  }
+  
+  // We need virtual register definitions for the operands that we will
+  // reassociate.
+  MachineInstr *MI1 = nullptr;
+  MachineInstr *MI2 = nullptr;
+  if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
+    MI1 = MRI.getUniqueVRegDef(Op1.getReg());
+  if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
+    MI2 = MRI.getUniqueVRegDef(Op2.getReg());
+
+  // And they need to be in the trace (otherwise, they won't have a depth).
+  if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB)
+    return true;
+
+  return false;
+}
+
+static bool hasReassociableSibling(const MachineInstr &Inst, bool &Commuted) {
+  const MachineBasicBlock *MBB = Inst.getParent();
+  const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
+  MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+  unsigned AssocOpcode = Inst.getOpcode();
+
+  // If only one operand has the same opcode and it's the second source operand,
+  // the operands must be commuted.
+  Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
+  if (Commuted)
+    std::swap(MI1, MI2);
+
+  // 1. The previous instruction must be the same type as Inst.
+  // 2. The previous instruction must have virtual register definitions for its
+  //    operands in the same basic block as Inst.
+  // 3. The previous instruction's result must only be used by Inst.
+  if (MI1->getOpcode() == AssocOpcode &&
+      hasReassociableOperands(*MI1, MBB) &&
+      MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
+    return true;
+
+  return false;
+}
+
+// TODO: There are many more machine instruction opcodes to match:
+//       1. Other data types (integer, vectors)
+//       2. Other math / logic operations (and, or)
+//       3. Other forms of the same operation (intrinsics and other variants)
+static bool isAssociativeAndCommutative(const MachineInstr &Inst) {
+  switch (Inst.getOpcode()) {
+  case X86::IMUL16rr:
+  case X86::IMUL32rr:
+  case X86::IMUL64rr:
+  // Normal min/max instructions are not commutative because of NaN and signed
+  // zero semantics, but these are. Thus, there's no need to check for global
+  // relaxed math; the instructions themselves have the properties we need.
+  case X86::MINCSSrr:
+  case X86::VMINCSSrr:
+    return true;
+  case X86::ADDPDrr:
+  case X86::ADDPSrr:
+  case X86::ADDSDrr:
+  case X86::ADDSSrr:
+  case X86::MULPDrr:
+  case X86::MULPSrr:
+  case X86::MULSDrr:
+  case X86::MULSSrr:
+  case X86::VADDPDrr:
+  case X86::VADDPSrr:
+  case X86::VADDPDYrr:
+  case X86::VADDPSYrr:
+  case X86::VADDSDrr:
+  case X86::VADDSSrr:
+  case X86::VMULPDrr:
+  case X86::VMULPSrr:
+  case X86::VMULPDYrr:
+  case X86::VMULPSYrr:
+  case X86::VMULSDrr:
+  case X86::VMULSSrr:
+    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+  default:
+    return false;
+  }
+}
+
+/// Return true if the input instruction is part of a chain of dependent ops
+/// that are suitable for reassociation, otherwise return false.
+/// If the instruction's operands must be commuted to have a previous
+/// instruction of the same type define the first source operand, Commuted will
+/// be set to true.
+static bool isReassociationCandidate(const MachineInstr &Inst, bool &Commuted) {
+  // 1. The operation must be associative and commutative.
+  // 2. The instruction must have virtual register definitions for its
+  //    operands in the same basic block.
+  // 3. The instruction must have a reassociable sibling.
+  if (isAssociativeAndCommutative(Inst) &&
+      hasReassociableOperands(Inst, Inst.getParent()) &&
+      hasReassociableSibling(Inst, Commuted))
+    return true;
+
+  return false;
+}
+
+// FIXME: This has the potential to be expensive (compile time) while not
+// improving the code at all. Some ways to limit the overhead:
+// 1. Track successful transforms; bail out if hit rate gets too low.
+// 2. Only enable at -O3 or some other non-default optimization level.
+// 3. Pre-screen pattern candidates here: if an operand of the previous
+//    instruction is known to not increase the critical path, then don't match
+//    that pattern.
+bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root,
+        SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const {
+  // TODO: There is nothing x86-specific here except the instruction type.
+  // This logic could be hoisted into the machine combiner pass itself.
+
+  // Look for this reassociation pattern:
+  //   B = A op X (Prev)
+  //   C = B op Y (Root)
+
+  bool Commute;
+  if (isReassociationCandidate(Root, Commute)) {
+    // We found a sequence of instructions that may be suitable for a
+    // reassociation of operands to increase ILP. Specify each commutation
+    // possibility for the Prev instruction in the sequence and let the
+    // machine combiner decide if changing the operands is worthwhile.
+    if (Commute) {
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB);
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB);
+    } else {
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY);
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY);
+    }
+    return true;
+  }
+
+  return false;
+}
+
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+static void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+                                  MachineInstr &NewMI1, MachineInstr &NewMI2) {
+  // Integer instructions define an implicit EFLAGS source register operand as
+  // the third source (fourth total) operand.
+  if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
+    return;
+
+  assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+         "Unexpected instruction type for reassociation");
+  
+  MachineOperand &OldOp1 = OldMI1.getOperand(3);
+  MachineOperand &OldOp2 = OldMI2.getOperand(3);
+  MachineOperand &NewOp1 = NewMI1.getOperand(3);
+  MachineOperand &NewOp2 = NewMI2.getOperand(3);
+
+  assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
+         "Must have dead EFLAGS operand in reassociable instruction");
+  assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
+         "Must have dead EFLAGS operand in reassociable instruction");
+
+  (void)OldOp1;
+  (void)OldOp2;
+
+  assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
+         "Unexpected operand in reassociable instruction");
+  assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
+         "Unexpected operand in reassociable instruction");
+
+  // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
+  // of this pass or other passes. The EFLAGS operands must be dead in these new
+  // instructions because the EFLAGS operands in the original instructions must
+  // be dead in order for reassociation to occur.
+  NewOp1.setIsDead();
+  NewOp2.setIsDead();
+}
+
+/// Attempt the following reassociation to reduce critical path length:
+///   B = A op X (Prev)
+///   C = B op Y (Root)
+///   ===>
+///   B = X op Y
+///   C = A op B
+static void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
+                           MachineCombinerPattern::MC_PATTERN Pattern,
+                           SmallVectorImpl<MachineInstr *> &InsInstrs,
+                           SmallVectorImpl<MachineInstr *> &DelInstrs,
+                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+  MachineFunction *MF = Root.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  // This array encodes the operand index for each parameter because the
+  // operands may be commuted. Each row corresponds to a pattern value,
+  // and each column specifies the index of A, B, X, Y.
+  unsigned OpIdx[4][4] = {
+    { 1, 1, 2, 2 },
+    { 1, 2, 2, 1 },
+    { 2, 1, 1, 2 },
+    { 2, 2, 1, 1 }
+  };
+
+  MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]);
+  MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]);
+  MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]);
+  MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]);
+  MachineOperand &OpC = Root.getOperand(0);
+
+  unsigned RegA = OpA.getReg();
+  unsigned RegB = OpB.getReg();
+  unsigned RegX = OpX.getReg();
+  unsigned RegY = OpY.getReg();
+  unsigned RegC = OpC.getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(RegA))
+    MRI.constrainRegClass(RegA, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegB))
+    MRI.constrainRegClass(RegB, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegX))
+    MRI.constrainRegClass(RegX, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegY))
+    MRI.constrainRegClass(RegY, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegC))
+    MRI.constrainRegClass(RegC, RC);
+
+  // Create a new virtual register for the result of (X op Y) instead of
+  // recycling RegB because the MachineCombiner's computation of the critical
+  // path requires a new register definition rather than an existing one.
+  unsigned NewVR = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+
+  unsigned Opcode = Root.getOpcode();
+  bool KillA = OpA.isKill();
+  bool KillX = OpX.isKill();
+  bool KillY = OpY.isKill();
+
+  // Create new instructions for insertion.
+  MachineInstrBuilder MIB1 =
+    BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR)
+      .addReg(RegX, getKillRegState(KillX))
+      .addReg(RegY, getKillRegState(KillY));
+  MachineInstrBuilder MIB2 =
+    BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
+      .addReg(RegA, getKillRegState(KillA))
+      .addReg(NewVR, getKillRegState(true));
+
+  setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2);
+
+  // Record new instructions for insertion and old instructions for deletion.
+  InsInstrs.push_back(MIB1);
+  InsInstrs.push_back(MIB2);
+  DelInstrs.push_back(&Prev);
+  DelInstrs.push_back(&Root);
+}
+
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root,
+    MachineCombinerPattern::MC_PATTERN Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
+  MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo();
+
+  // Select the previous instruction in the sequence based on the input pattern.
+  MachineInstr *Prev = nullptr;
+  switch (Pattern) {
+    case MachineCombinerPattern::MC_REASSOC_AX_BY:
+    case MachineCombinerPattern::MC_REASSOC_XA_BY:
+      Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+      break;
+    case MachineCombinerPattern::MC_REASSOC_AX_YB:
+    case MachineCombinerPattern::MC_REASSOC_XA_YB:
+      Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+  }
+  assert(Prev && "Unknown pattern for machine combiner");
+
+  reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
+  return;
+}
+
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+  return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+  using namespace X86II;
+  static std::pair<unsigned, const char *> TargetFlags[] = {
+      {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+      {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+      {MO_GOT, "x86-got"},
+      {MO_GOTOFF, "x86-gotoff"},
+      {MO_GOTPCREL, "x86-gotpcrel"},
+      {MO_PLT, "x86-plt"},
+      {MO_TLSGD, "x86-tlsgd"},
+      {MO_TLSLD, "x86-tlsld"},
+      {MO_TLSLDM, "x86-tlsldm"},
+      {MO_GOTTPOFF, "x86-gottpoff"},
+      {MO_INDNTPOFF, "x86-indntpoff"},
+      {MO_TPOFF, "x86-tpoff"},
+      {MO_DTPOFF, "x86-dtpoff"},
+      {MO_NTPOFF, "x86-ntpoff"},
+      {MO_GOTNTPOFF, "x86-gotntpoff"},
+      {MO_DLLIMPORT, "x86-dllimport"},
+      {MO_DARWIN_STUB, "x86-darwin-stub"},
+      {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+      {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+      {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"},
+      {MO_TLVP, "x86-tlvp"},
+      {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+      {MO_SECREL, "x86-secrel"}};
+  return makeArrayRef(TargetFlags);
+}
+
  namespace {
-  /// CGBR - Create Global Base Reg pass. This initializes the PIC
+  /// Create Global Base Reg pass. This initializes the PIC
    /// global base register for x86-32.
    struct CGBR : public MachineFunctionPass {
      static char ID;
@@ -5955,10 +6664,11 @@ namespace {
      bool runOnMachineFunction(MachineFunction &MF) override {
        const X86TargetMachine *TM =
          static_cast<const X86TargetMachine *>(&MF.getTarget());
+      const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  
        // Don't do anything if this is 64-bit as 64-bit PIC
        // uses RIP relative addressing.
-      if (TM->getSubtarget<X86Subtarget>().is64Bit())
+      if (STI.is64Bit())
          return false;
  
        // Only emit a global base reg in PIC mode.
@@ -5977,10 +6687,10 @@ namespace {
        MachineBasicBlock::iterator MBBI = FirstMBB.begin();
        DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
        MachineRegisterInfo &RegInfo = MF.getRegInfo();
-      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+      const X86InstrInfo *TII = STI.getInstrInfo();
  
        unsigned PC;
-      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
+      if (STI.isPICStyleGOT())
          PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
        else
          PC = GlobalBaseReg;
@@ -5991,7 +6701,7 @@ namespace {
  
        // If we're using vanilla 'GOT' PIC style, we should use relative addressing
        // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
-      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+      if (STI.isPICStyleGOT()) {
          // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
            .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
@@ -6072,10 +6782,9 @@ namespace {
      MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
                                           unsigned TLSBaseAddrReg) {
        MachineFunction *MF = I->getParent()->getParent();
-      const X86TargetMachine *TM =
-          static_cast<const X86TargetMachine *>(&MF->getTarget());
-      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
-      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+      const bool is64Bit = STI.is64Bit();
+      const X86InstrInfo *TII = STI.getInstrInfo();
  
        // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
        MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
@@ -6093,10 +6802,9 @@ namespace {
      // inserting a copy instruction after I. Returns the new instruction.
      MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
        MachineFunction *MF = I->getParent()->getParent();
-      const X86TargetMachine *TM =
-          static_cast<const X86TargetMachine *>(&MF->getTarget());
-      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
-      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+      const bool is64Bit = STI.is64Bit();
+      const X86InstrInfo *TII = STI.getInstrInfo();
  
        // Create a virtual register for the TLS base address.
        MachineRegisterInfo &RegInfo = MF->getRegInfo();