[X86][SSE] Vector integer/float conversion memory folding
[oota-llvm.git] / lib / Target / X86 / X86InstrInfo.cpp
index da011cb78924e541a16ededc36c6cb270978f6e2..b5cbee566f2409276b87ea8ba1989aa3397929db 100644 (file)
@@ -566,6 +566,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
     { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
     { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
+    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
+    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
+    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
+    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
 
     // AVX2 foldable instructions
     { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
@@ -574,13 +581,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
     { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
     { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
-    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
-    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
-    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
-    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
-    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
-    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
-    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
 
     // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
     { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
@@ -3907,10 +3907,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
 /// operand at the use. We fold the load instructions if load defines a virtual
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
-MachineInstr* X86InstrInfo::
-optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
-                  unsigned &FoldAsLoadDefReg,
-                  MachineInstr *&DefMI) const {
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
+                                              const MachineRegisterInfo *MRI,
+                                              unsigned &FoldAsLoadDefReg,
+                                              MachineInstr *&DefMI) const {
   if (FoldAsLoadDefReg == 0)
     return nullptr;
   // To be conservative, if there exists another load, clear the load candidate.
@@ -3943,7 +3943,8 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
     SrcOperandId = i;
     FoundSrcOperand = true;
   }
-  if (!FoundSrcOperand) return nullptr;
+  if (!FoundSrcOperand)
+    return nullptr;
 
   // Check whether we can fold the def into SrcOperandId.
   SmallVector<unsigned, 8> Ops;
@@ -4113,7 +4114,8 @@ MachineInstr*
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                     MachineInstr *MI, unsigned i,
                                     const SmallVectorImpl<MachineOperand> &MOs,
-                                    unsigned Size, unsigned Align, bool AllowCommute) const {
+                                    unsigned Size, unsigned Align,
+                                    bool AllowCommute) const {
   const DenseMap<unsigned,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -4181,8 +4183,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
           if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
             return nullptr;
           // If this is a 64-bit load, but the spill slot is 32, then we can do
-          // a 32-bit load which is implicitly zero-extended. This likely is due
-          // to liveintervalanalysis remat'ing a load from stack slot.
+          // a 32-bit load which is implicitly zero-extended. This likely is
+          // due to live interval analysis remat'ing a load from stack slot.
           if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
             return nullptr;
           Opcode = X86::MOV32rm;
@@ -4201,8 +4203,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
         // to a 32-bit one.
         unsigned DstReg = NewMI->getOperand(0).getReg();
         if (TargetRegisterInfo::isPhysicalRegister(DstReg))
-          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
-                                                   X86::sub_32bit));
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
         else
           NewMI->getOperand(0).setSubReg(X86::sub_32bit);
       }
@@ -4210,12 +4211,29 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     }
   }
 
-  // If the instruction and target operand are commutable, commute the instruction and try again.
+  // If the instruction and target operand are commutable, commute the
+  // instruction and try again.
   if (AllowCommute) {
     unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
     if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
-      if ((CommuteOpIdx1 == OriginalOpIdx) || (CommuteOpIdx2 == OriginalOpIdx)) {
-        MachineInstr* CommutedMI = commuteInstruction(MI, false);
+      bool HasDef = MI->getDesc().getNumDefs();
+      unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
+      unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
+      unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
+      bool Tied0 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied1 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+      // If either of the commutable operands are tied to the destination
+      // then we can not commute + fold.
+      if ((HasDef && Reg0 == Reg1 && Tied0) ||
+          (HasDef && Reg0 == Reg2 && Tied1))
+        return nullptr;
+
+      if ((CommuteOpIdx1 == OriginalOpIdx) ||
+          (CommuteOpIdx2 == OriginalOpIdx)) {
+        MachineInstr *CommutedMI = commuteInstruction(MI, false);
         if (!CommutedMI) {
           // Unable to commute.
           return nullptr;
@@ -4227,13 +4245,15 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
         }
 
         // Attempt to fold with the commuted version of the instruction.
-        unsigned CommuteOpIdx = (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
-        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx, MOs, Size, Align, /*AllowCommute=*/ false);
+        unsigned CommuteOp =
+            (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
+        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
+                                      /*AllowCommute=*/false);
         if (NewMI)
           return NewMI;
 
         // Folding failed again - undo the commute before returning.
-        MachineInstrUncommutedMI = commuteInstruction(MI, false);
+        MachineInstr *UncommutedMI = commuteInstruction(MI, false);
         if (!UncommutedMI) {
           // Unable to commute.
           return nullptr;
@@ -4459,7 +4479,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
 
   SmallVector<MachineOperand,4> MOs;
   MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment, /*AllowCommute=*/ true);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               Size, Alignment, /*AllowCommute=*/true);
 }
 
 static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@@ -4612,7 +4633,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     break;
   }
   }
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment, /*AllowCommute=*/ true);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
 }