[X86][SSE] Vector integer/float conversion memory folding

[oota-llvm.git] / lib / Target / X86 / X86InstrInfo.cpp
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index da011cb78924e541a16ededc36c6cb270978f6e2..b5cbee566f2409276b87ea8ba1989aa3397929db 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -566,6 +566,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
      { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
      { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
+    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
+    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
+    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
+    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
  
      // AVX2 foldable instructions
      { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
@@ -574,13 +581,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
      { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
      { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
-    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
-    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
-    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
-    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
-    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
-    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
-    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
  
      // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
      { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
@@ -3907,10 +3907,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
  /// operand at the use. We fold the load instructions if load defines a virtual
  /// register, the virtual register is used once in the same BB, and the
  /// instructions in-between do not load or store, and have no side effects.
-MachineInstr* X86InstrInfo::
-optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
-                  unsigned &FoldAsLoadDefReg,
-                  MachineInstr *&DefMI) const {
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
+                                              const MachineRegisterInfo *MRI,
+                                              unsigned &FoldAsLoadDefReg,
+                                              MachineInstr *&DefMI) const {
    if (FoldAsLoadDefReg == 0)
      return nullptr;
    // To be conservative, if there exists another load, clear the load candidate.
@@ -3943,7 +3943,8 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
      SrcOperandId = i;
      FoundSrcOperand = true;
    }
-  if (!FoundSrcOperand) return nullptr;
+  if (!FoundSrcOperand)
+    return nullptr;
  
    // Check whether we can fold the def into SrcOperandId.
    SmallVector<unsigned, 8> Ops;
@@ -4113,7 +4114,8 @@ MachineInstr*
  X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                      MachineInstr *MI, unsigned i,
                                      const SmallVectorImpl<MachineOperand> &MOs,
-                                    unsigned Size, unsigned Align, bool AllowCommute) const {
+                                    unsigned Size, unsigned Align,
+                                    bool AllowCommute) const {
    const DenseMap<unsigned,
                   std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
    bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -4181,8 +4183,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
            if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
              return nullptr;
            // If this is a 64-bit load, but the spill slot is 32, then we can do
-          // a 32-bit load which is implicitly zero-extended. This likely is due
-          // to liveintervalanalysis remat'ing a load from stack slot.
+          // a 32-bit load which is implicitly zero-extended. This likely is
+          // due to live interval analysis remat'ing a load from stack slot.
            if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
              return nullptr;
            Opcode = X86::MOV32rm;
@@ -4201,8 +4203,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
          // to a 32-bit one.
          unsigned DstReg = NewMI->getOperand(0).getReg();
          if (TargetRegisterInfo::isPhysicalRegister(DstReg))
-          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
-                                                   X86::sub_32bit));
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
          else
            NewMI->getOperand(0).setSubReg(X86::sub_32bit);
        }
@@ -4210,12 +4211,29 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      }
    }
  
-  // If the instruction and target operand are commutable, commute the instruction and try again.
+  // If the instruction and target operand are commutable, commute the
+  // instruction and try again.
    if (AllowCommute) {
      unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
      if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
-      if ((CommuteOpIdx1 == OriginalOpIdx) || (CommuteOpIdx2 == OriginalOpIdx)) {
-        MachineInstr* CommutedMI = commuteInstruction(MI, false);
+      bool HasDef = MI->getDesc().getNumDefs();
+      unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
+      unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
+      unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
+      bool Tied0 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied1 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+      // If either of the commutable operands is tied to the destination,
+      // then we cannot commute + fold.
+      if ((HasDef && Reg0 == Reg1 && Tied0) ||
+          (HasDef && Reg0 == Reg2 && Tied1))
+        return nullptr;
+
+      if ((CommuteOpIdx1 == OriginalOpIdx) ||
+          (CommuteOpIdx2 == OriginalOpIdx)) {
+        MachineInstr *CommutedMI = commuteInstruction(MI, false);
          if (!CommutedMI) {
            // Unable to commute.
            return nullptr;
@@ -4227,13 +4245,15 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
          }
  
          // Attempt to fold with the commuted version of the instruction.
-        unsigned CommuteOpIdx = (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
-        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx, MOs, Size, Align, /*AllowCommute=*/ false);
+        unsigned CommuteOp =
+            (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
+        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
+                                      /*AllowCommute=*/false);
          if (NewMI)
            return NewMI;
  
          // Folding failed again - undo the commute before returning.
-        MachineInstr* UncommutedMI = commuteInstruction(MI, false);
+        MachineInstr *UncommutedMI = commuteInstruction(MI, false);
          if (!UncommutedMI) {
            // Unable to commute.
            return nullptr;
@@ -4459,7 +4479,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
  
    SmallVector<MachineOperand,4> MOs;
    MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment, /*AllowCommute=*/ true);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               Size, Alignment, /*AllowCommute=*/true);
  }
  
  static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@@ -4612,7 +4633,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      break;
    }
    }
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment, /*AllowCommute=*/ true);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
  }