BUILD_VECTOR was missing out on some prime opportunities to use SSE 4.1 inserts.

author Nate Begeman <natebegeman@mac.com>

Wed, 24 Mar 2010 20:49:50 +0000 (20:49 +0000)

committer Nate Begeman <natebegeman@mac.com>

Wed, 24 Mar 2010 20:49:50 +0000 (20:49 +0000)
author Nate Begeman <natebegeman@mac.com>
Wed, 24 Mar 2010 20:49:50 +0000 (20:49 +0000)
committer Nate Begeman <natebegeman@mac.com>
Wed, 24 Mar 2010 20:49:50 +0000 (20:49 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 704f9c65a59ee288fb39b0dd029fba3e68504246..960655806b7e325a5890fb5f9bc7864c8d02ee08 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3613,6 +3613,54 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
    return SDValue();
  }
  
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+                                        DebugLoc &dl, SelectionDAG &DAG) {
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = Elts.size();
+  
+  // FIXME: check for zeroes
+  LoadSDNode *LDBase = NULL;
+  unsigned LastLoadedElt = -1U;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Elts[i];
+    
+    if (!Elt.getNode() ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+      return SDValue();
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+        return SDValue();
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    LoadSDNode *LD = cast<LoadSDNode>(Elt);
+    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+      return SDValue();
+    LastLoadedElt = i;
+  }
+                                       
+  if (LastLoadedElt == NumElems - 1) {
+    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
+      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+                       LDBase->isVolatile(), LDBase->isNonTemporal(),
+                       LDBase->getAlignment());
+  } else if (NumElems == 4 && LastLoadedElt == 1) {
+    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
+  }
+  return SDValue();
+}
+
  SDValue
  X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    DebugLoc dl = Op.getDebugLoc();
@@ -3841,14 +3889,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
      return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
    }
  
-  if (Values.size() > 2) {
-    // If we have SSE 4.1, Expand into a number of inserts unless the number of
-    // values to be inserted is equal to the number of elements, in which case
-    // use the unpack code below in the hopes of matching the consecutive elts
-    // load merge pattern for shuffles.
-    // FIXME: We could probably just check that here directly.
-    if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
-        getSubtarget()->hasSSE41()) {
+  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+    // Check for a build vector of consecutive loads.
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = Op.getOperand(i);
+    
+    // Check for elements which are consecutive loads.
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    if (LD.getNode())
+      return LD;
+    
+    // For SSE 4.1, use inserts into undef.  
+    if (getSubtarget()->hasSSE41()) {
        V[0] = DAG.getUNDEF(VT);
        for (unsigned i = 0; i < NumElems; ++i)
          if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
@@ -3856,7 +3908,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
                               Op.getOperand(i), DAG.getIntPtrConstant(i));
        return V[0];
      }
-    // Expand into a number of unpckl*.
+    
+    // Otherwise, expand into a number of unpckl*
      // e.g. for v4f32
      //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
      //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
@@ -3871,7 +3924,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
      }
      return V[0];
    }
-
    return SDValue();
  }
  
@@ -8797,83 +8849,24 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
    return TargetLowering::isGAPlusOffset(N, GA, Offset);
  }
  
-static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
-                                     EVT EltVT, LoadSDNode *&LDBase,
-                                     unsigned &LastLoadedElt,
-                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
-                                     const TargetLowering &TLI) {
-  LDBase = NULL;
-  LastLoadedElt = -1U;
-  for (unsigned i = 0; i < NumElems; ++i) {
-    if (N->getMaskElt(i) < 0) {
-      if (!LDBase)
-        return false;
-      continue;
-    }
-
-    SDValue Elt = DAG.getShuffleScalarElt(N, i);
-    if (!Elt.getNode() ||
-        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
-      return false;
-    if (!LDBase) {
-      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
-        return false;
-      LDBase = cast<LoadSDNode>(Elt.getNode());
-      LastLoadedElt = i;
-      continue;
-    }
-    if (Elt.getOpcode() == ISD::UNDEF)
-      continue;
-
-    LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
-      return false;
-    LastLoadedElt = i;
-  }
-  return true;
-}
-
  /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
  /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
  /// if the load addresses are consecutive, non-overlapping, and in the right
-/// order.  In the case of v2i64, it will see if it can rewrite the
-/// shuffle to be an appropriate build vector so it can take advantage of
-// performBuildVectorCombine.
+/// order.
  static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                       const TargetLowering &TLI) {
    DebugLoc dl = N->getDebugLoc();
    EVT VT = N->getValueType(0);
-  EVT EltVT = VT.getVectorElementType();
    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
-  unsigned NumElems = VT.getVectorNumElements();
  
    if (VT.getSizeInBits() != 128)
      return SDValue();
  
-  // Try to combine a vector_shuffle into a 128-bit load.
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  LoadSDNode *LD = NULL;
-  unsigned LastLoadedElt;
-  if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
-                                MFI, TLI))
-    return SDValue();
-
-  if (LastLoadedElt == NumElems - 1) {
-    if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
-      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                         LD->getSrcValue(), LD->getSrcValueOffset(),
-                         LD->isVolatile(), LD->isNonTemporal(), 0);
-    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                       LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile(), LD->isNonTemporal(),
-                       LD->getAlignment());
-  } else if (NumElems == 4 && LastLoadedElt == 1) {
-    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
-    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
-    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
-  }
-  return SDValue();
+  SmallVector<SDValue, 16> Elts;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
+  
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
  }
  
  /// PerformShuffleCombine - Detect vector gather/scatter index generation
diff --git a/test/CodeGen/X86/vec_insert-4.ll b/test/CodeGen/X86/vec_insert-4.ll

new file mode 100644 (file)

index 0000000..2c31e56
--- /dev/null
+++ b/test/CodeGen/X86/vec_insert-4.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=x86 -mcpu=yonah | grep 1084227584 | count 1
+
+; ModuleID = '<stdin>'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9.2.2"
+
+define <8 x float> @f(<8 x float> %a, i32 %b) nounwind  {
+entry:
+       %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b              ; <<4 x float>> [#uses=1]
+       ret <8 x float> %vecins
+}
diff --git a/test/CodeGen/X86/vec_insert-9.ll b/test/CodeGen/X86/vec_insert-9.ll

new file mode 100644 (file)

index 0000000..2e829df
--- /dev/null
+++ b/test/CodeGen/X86/vec_insert-9.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86 -mattr=+sse41 > %t
+; RUN: grep pinsrd %t | count 2
+
+define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind  {
+entry:
+       %tmp3 = insertelement <4 x i32> undef, i32 %val, i32 0          ; <<4 x i32>> [#uses=1]
+       %tmp4 = insertelement <4 x i32> %tmp3, i32 %idx, i32 3          ; <<4 x i32>> [#uses=1]
+       ret <4 x i32> %tmp4
+}
diff --git a/test/CodeGen/X86/vec_insert_4.ll b/test/CodeGen/X86/vec_insert_4.ll

deleted file mode 100644 (file)

index 2c31e56..0000000
--- a/test/CodeGen/X86/vec_insert_4.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep 1084227584 | count 1
-
-; ModuleID = '<stdin>'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i686-apple-darwin9.2.2"
-
-define <8 x float> @f(<8 x float> %a, i32 %b) nounwind  {
-entry:
-       %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b              ; <<4 x float>> [#uses=1]
-       ret <8 x float> %vecins
-}
diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll

index c316df887c16e9cb8ae8e4f5c831ca8bcbd6d815..7f5f8dd213a5cf2fcc71848db6992204c5c58ebe 100644 (file)
--- a/test/CodeGen/X86/vec_set.ll
+++ b/test/CodeGen/X86/vec_set.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep punpckl | count 7
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep punpckl | count 7
  
  define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
          %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0          ; <<8 x i16>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll

index c05b79a54a15d8e4d188f5066969dd081bc2e949..2a48de22098faf34a276fab16c01b5373df37292 100644 (file)
--- a/test/CodeGen/X86/vec_shuffle.ll
+++ b/test/CodeGen/X86/vec_shuffle.ll
@@ -1,5 +1,6 @@
  ; RUN: llc < %s -march=x86 -mcpu=core2 -o %t
-; RUN: grep shufp   %t | count 1
+; RUN: grep movq    %t | count 1
+; RUN: grep pshufd  %t | count 1
  ; RUN: grep movupd  %t | count 1
  ; RUN: grep pshufhw %t | count 1
author	Nate Begeman <natebegeman@mac.com>
	Wed, 24 Mar 2010 20:49:50 +0000 (20:49 +0000)
committer	Nate Begeman <natebegeman@mac.com>
	Wed, 24 Mar 2010 20:49:50 +0000 (20:49 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vec_insert-4.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/vec_insert-9.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/vec_insert_4.ll	[deleted file]	patch \| blob \| history
test/CodeGen/X86/vec_set.ll		patch \| blob \| history
test/CodeGen/X86/vec_shuffle.ll		patch \| blob \| history