[x86] Teach the new v4i32 shuffle lowering some more tricks to recognize

author Chandler Carruth <chandlerc@gmail.com>

Thu, 4 Sep 2014 09:26:30 +0000 (09:26 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Thu, 4 Sep 2014 09:26:30 +0000 (09:26 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Thu, 4 Sep 2014 09:26:30 +0000 (09:26 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Thu, 4 Sep 2014 09:26:30 +0000 (09:26 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 8415c8f708da28a5a45c1f700e23f922e856839c..f85b00a84f8b621ec53a815830831f7dd20e116b 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6840,6 +6840,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
          // convert it to a vector with movd (S2V+shuffle to zero extend).
          Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
          // convert it to a vector with movd (S2V+shuffle to zero extend).
          Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+
+        // If using the new shuffle lowering, just directly insert this.
+        if (ExperimentalVectorShuffleLowering)
+          return DAG.getNode(
+              ISD::BITCAST, dl, VT,
+              getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
+
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
  
          // Now we have our 32-bit value zero extended in the low element of
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
  
          // Now we have our 32-bit value zero extended in the low element of
@@ -6913,6 +6920,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      if (EVTBits == 32) {
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
  
      if (EVTBits == 32) {
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
  
+      // If using the new shuffle lowering, just directly insert this.
+      if (ExperimentalVectorShuffleLowering)
+        return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+
        // Turn it into a shuffle of zero and zero-extended scalar to vector.
        Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
        SmallVector<int, 8> MaskVec;
        // Turn it into a shuffle of zero and zero-extended scalar to vector.
        Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
        SmallVector<int, 8> MaskVec;
@@ -7492,7 +7503,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    ArrayRef<int> Mask = SVOp->getMask();
    assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  
    ArrayRef<int> Mask = SVOp->getMask();
    assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  
-  if (isSingleInputShuffleMask(Mask))
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0)
      // Straight shuffle of a single input vector. For everything from SSE2
      // onward this has a single fast instruction with no scary immediates.
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
      // Straight shuffle of a single input vector. For everything from SSE2
      // onward this has a single fast instruction with no scary immediates.
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
@@ -7504,6 +7518,52 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
  
    if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
  
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1) {
+    int V2Index =
+        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+        Mask.begin();
+
+    // Check for a single input from a SCALAR_TO_VECTOR node.
+    // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+    // all the smarts here sunk into that routine. However, the current
+    // lowering of BUILD_VECTOR makes that nearly impossible until the old
+    // vector shuffle lowering is dead.
+    if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+        V2.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue V2S = V2.getOperand(Mask[V2Index] - 4);
+
+      bool V1IsAllZero = false;
+      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+        V1IsAllZero = true;
+      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        V1IsAllZero = true;
+        for (int M : Mask) {
+          if (M < 0 || M >= 4)
+            continue;
+          SDValue Input = V1.getOperand(M);
+          if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
+            // A non-zero input!
+            V1IsAllZero = false;
+            break;
+          }
+        }
+      }
+      if (V1IsAllZero) {
+        V2 = DAG.getNode(
+            X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
+            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S));
+        if (V2Index != 0) {
+          int V2Shuffle[] = {1, 1, 1, 1};
+          V2Shuffle[V2Index] = 0;
+          V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2,
+                                    DAG.getUNDEF(MVT::v4i32), V2Shuffle);
+        }
+        return V2;
+      }
+    }
+  }
+
    // We implement this with SHUFPS because it can blend from two vectors.
    // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
    // up the inputs, bypassing domain shift penalties that we would incur if we
    // We implement this with SHUFPS because it can blend from two vectors.
    // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
    // up the inputs, bypassing domain shift penalties that we would incur if we
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll

index bdd852138853844170ffdb67cc8c4430a5308817..9823963d61b7ed675b90a86410910b0346cf3801 100644 (file)
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,10 +1,15 @@
  ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
  ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-EXP
  
  define <4 x float> @test(float %a) {
  ; CHECK-LABEL: test:
  ; CHECK:         movss {{.*}}, %xmm0
  ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
  ; CHECK-NEXT:    retl
  
  define <4 x float> @test(float %a) {
  ; CHECK-LABEL: test:
  ; CHECK:         movss {{.*}}, %xmm0
  ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
  ; CHECK-NEXT:    retl
+;
+; CHECK-EXP-LABEL: test:
+; CHECK-EXP:         insertps $285, {{.*}}, %xmm0
+; CHECK-EXP-NEXT:    retl
  
  entry:
    %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
  
  entry:
    %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
@@ -18,6 +23,11 @@ define <2 x i64> @test2(i32 %a) {
  ; CHECK:         movd {{.*}}, %xmm0
  ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
  ; CHECK-NEXT:    retl
  ; CHECK:         movd {{.*}}, %xmm0
  ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
  ; CHECK-NEXT:    retl
+;
+; CHECK-EXP-LABEL: test2:
+; CHECK-EXP:         movd {{.*}}, %xmm0
+; CHECK-EXP-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; CHECK-EXP-NEXT:    retl
  
  entry:
    %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
  
  entry:
    %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
@@ -32,6 +42,10 @@ define <4 x float> @test3(<4 x float> %A) {
  ; CHECK-NEXT:    movss %xmm0, %[[X1]]
  ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = [[X1]][1,0,1,1]
  ; CHECK-NEXT:    retl
  ; CHECK-NEXT:    movss %xmm0, %[[X1]]
  ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = [[X1]][1,0,1,1]
  ; CHECK-NEXT:    retl
+;
+; CHECK-EXP-LABEL: test3:
+; CHECK-EXP:         insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; CHECK-EXP-NEXT:    retl
  
    %tmp0 = extractelement <4 x float> %A, i32 0
    %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
  
    %tmp0 = extractelement <4 x float> %A, i32 0
    %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll

index 7f448835b5d9c73103a58f1927475bca68dbce7c..9105197f67c035d00d2df5706fa1fccb5286cb78 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -317,3 +317,52 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
    %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
    ret <4 x float> %shuffle
  }
    %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
    ret <4 x float> %shuffle
  }
+
+define <4 x i32> @shuffle_v4i32_4zzz(i32 %i) {
+; ALL-LABEL: @shuffle_v4i32_4zzz
+; ALL:         movd {{.*}}, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <4 x i32> undef, i32 %i, i32 0
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z4zz(i32 %i) {
+; ALL-LABEL: @shuffle_v4i32_z4zz
+; ALL:         movd {{.*}}, %xmm0
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[1,0,1,1]
+; ALL-NEXT:    retq
+  %a = insertelement <4 x i32> undef, i32 %i, i32 0
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zz4z(i32 %i) {
+; ALL-LABEL: @shuffle_v4i32_zz4z
+; ALL:         movd {{.*}}, %xmm0
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[1,1,0,1]
+; ALL-NEXT:    retq
+  %a = insertelement <4 x i32> undef, i32 %i, i32 0
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zuu4(i32 %i) {
+; ALL-LABEL: @shuffle_v4i32_zuu4
+; ALL:         movd {{.*}}, %xmm0
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[1,1,1,0]
+; ALL-NEXT:    retq
+  %a = insertelement <4 x i32> undef, i32 %i, i32 0
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z6zz(i32 %i) {
+; ALL-LABEL: @shuffle_v4i32_z6zz
+; ALL:         movd {{.*}}, %xmm0
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm0[1,0,1,1]
+; ALL-NEXT:    retq
+  %a = insertelement <4 x i32> undef, i32 %i, i32 2
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
author	Chandler Carruth <chandlerc@gmail.com>
	Thu, 4 Sep 2014 09:26:30 +0000 (09:26 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Thu, 4 Sep 2014 09:26:30 +0000 (09:26 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/vec_set-3.ll		patch | blob | history
test/CodeGen/X86/vector-shuffle-128-v4.ll		patch | blob | history