Optimize splat of a scalar load into a shuffle of a vector load when it's legal....

author Evan Cheng <evan.cheng@apple.com>

Wed, 9 Dec 2009 21:00:30 +0000 (21:00 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Wed, 9 Dec 2009 21:00:30 +0000 (21:00 +0000)
author Evan Cheng <evan.cheng@apple.com>
Wed, 9 Dec 2009 21:00:30 +0000 (21:00 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Wed, 9 Dec 2009 21:00:30 +0000 (21:00 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 38e1ffe89244ac9bc5d926bbb803e2d3e456fe99..8284b17272d23149cddc3f575eeb9a02d04683cf 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3343,6 +3343,82 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                               DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
  }
  
+SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                          SelectionDAG &DAG) {
+  
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst" see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (Ptr.getOpcode() == ISD::ADD &&
+               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < 16) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. Reference stack + offset explicitly
+        // if stack pointer is at least 16-byte aligned.
+        unsigned StackAlign = Subtarget->getStackAlignment();
+        if (StackAlign < 16)
+          return SDValue();
+        Offset = MFI->getObjectOffset(FI) + Offset;
+        SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+                                              getPointerTy());
+        Ptr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                          DAG.getConstant(Offset & ~15, getPointerTy()));
+        Offset %= 16;
+      } else {
+        MFI->setObjectAlignment(FI, 16);
+      }
+    }
+
+    // (Offset % 16) must be multiple of 4. Then address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % 16) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~15;
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
+    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
+    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0);
+    // Canonicalize it to a v4i32 shuffle.
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
+                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
+  }
+
+  return SDValue();
+}
+
  SDValue
  X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    DebugLoc dl = Op.getDebugLoc();
@@ -3486,8 +3562,19 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    }
  
    // Splat is obviously ok. Let legalizer expand it to a shuffle.
-  if (Values.size() == 1)
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // Check if it's possible to issue this instead.
+      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
      return SDValue();
+  }
  
    // A vector full of immediates; various special cases are already
    // handled, so this is best done with a single constant-pool load.
@@ -4278,7 +4365,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
    unsigned ShAmt = 0;
    SDValue ShVal;
    bool isShift = getSubtarget()->hasSSE2() &&
-  isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
    if (isShift && ShVal.hasOneUse()) {
      // If the shifted value has multiple uses, it may be cheaper to use
      // v_set0 + movlhps or movhlps, etc.
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index 7b4ab62fddd5e2aaaa50773d513554186b70de1c..89b773df5624a81036c529888d9e72bf1269e153 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -626,7 +626,9 @@ namespace llvm {
  
      std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                                 bool isSigned);
-    
+
+    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                   SelectionDAG &DAG);
      SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
      SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
      SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index dfdd4ce36c6d4209a592eef3bc6c46f313819e4b..62841f8dec2225f6a81a4ebe5e04afdd86dfbf8c 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2083,7 +2083,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
                       (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                       "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (v4i32 (pshufd:$src2
-                                             (bc_v4i32(memopv2i64 addr:$src1)),
+                                             (bc_v4i32 (memopv2i64 addr:$src1)),
                                               (undef))))]>;
  }
  
diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll

new file mode 100644 (file)

index 0000000..6c93efa
--- /dev/null
+++ b/test/CodeGen/X86/splat-scalar-load.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; rdar://7434544
+
+define <2 x i64> @t1() nounwind ssp {
+entry:
+; CHECK: t1:
+; CHECK: pshufd        $0, (%esp), %xmm0
+  %array = alloca [8 x float], align 16
+  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 0
+  %tmp2 = load float* %arrayidx
+  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
+  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
+  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
+  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
+  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @t2() nounwind ssp {
+entry:
+; CHECK: t2:
+; CHECK: pshufd        $85, (%esp), %xmm0
+  %array = alloca [8 x float], align 4
+  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1
+  %tmp2 = load float* %arrayidx
+  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
+  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
+  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
+  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
+  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
+  ret <2 x i64> %0
+}
+
+define <4 x float> @t3(float %tmp1, float %tmp2, float %tmp3) nounwind readnone ssp {
+entry:
+; CHECK: t3:
+; CHECK: pshufd        $-86, (%esp), %xmm0
+  %0 = insertelement <4 x float> undef, float %tmp3, i32 0
+  %1 = insertelement <4 x float> %0, float %tmp3, i32 1
+  %2 = insertelement <4 x float> %1, float %tmp3, i32 2
+  %3 = insertelement <4 x float> %2, float %tmp3, i32 3
+  ret <4 x float> %3
+}
author	Evan Cheng <evan.cheng@apple.com>
	Wed, 9 Dec 2009 21:00:30 +0000 (21:00 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Wed, 9 Dec 2009 21:00:30 +0000 (21:00 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
test/CodeGen/X86/splat-scalar-load.ll	[new file with mode: 0644]	patch \| blob