[X86][SSE] Shuffle blends with zero

author Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 29 Oct 2015 22:11:28 +0000 (22:11 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 29 Oct 2015 22:11:28 +0000 (22:11 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 08db5b89e96e97c6af8843f0fb65cd6e57308909..494121e540c22171b04f0359b8e486bf03a9a171 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6859,22 +6859,62 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
  /// This doesn't do any checks for the availability of instructions for blending
  /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
  /// be matched in the backend with the type given. What it does check for is
  /// This doesn't do any checks for the availability of instructions for blending
  /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
  /// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is in fact a blend.
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
  static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
  static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
-                                         SDValue V2, ArrayRef<int> Mask,
+                                         SDValue V2, ArrayRef<int> Original,
                                           const X86Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
                                           const X86Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+  SmallVector<int, 8> Mask(Original.begin(), Original.end());
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  bool ForceV1Zero = false, ForceV2Zero = false;
+
+  // Attempt to generate the binary blend mask. If an input is zero then
+  // we can use any lane.
+  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
    unsigned BlendMask = 0;
    for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    unsigned BlendMask = 0;
    for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] >= Size) {
-      if (Mask[i] != i + Size)
-        return SDValue(); // Shuffled V2 input!
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+    if (M == i)
+      continue;
+    if (M == i + Size) {
        BlendMask |= 1u << i;
        continue;
      }
        BlendMask |= 1u << i;
        continue;
      }
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return SDValue(); // Shuffled V1 input!
+    if (Zeroable[i]) {
+      if (V1IsZero) {
+        ForceV1Zero = true;
+        Mask[i] = i;
+        continue;
+      }
+      if (V2IsZero) {
+        ForceV2Zero = true;
+        BlendMask |= 1u << i;
+        Mask[i] = i + Size;
+        continue;
+      }
+    }
+    return SDValue(); // Shuffled input!
    }
    }
+
+  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+  if (ForceV1Zero)
+    V1 = getZeroVector(VT, Subtarget, DAG, DL);
+  if (ForceV2Zero)
+    V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+  auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
+    unsigned ScaledMask = 0;
+    for (int i = 0; i != Size; ++i)
+      if (BlendMask & (1u << i))
+        for (int j = 0; j != Scale; ++j)
+          ScaledMask |= 1u << (i * Scale + j);
+    return ScaledMask;
+  };
+
    switch (VT.SimpleTy) {
    case MVT::v2f64:
    case MVT::v4f32:
    switch (VT.SimpleTy) {
    case MVT::v2f64:
    case MVT::v4f32:
@@ -6894,12 +6934,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
      if (Subtarget->hasAVX2()) {
        // Scale the blend by the number of 32-bit dwords per element.
        int Scale =  VT.getScalarSizeInBits() / 32;
      if (Subtarget->hasAVX2()) {
        // Scale the blend by the number of 32-bit dwords per element.
        int Scale =  VT.getScalarSizeInBits() / 32;
-      BlendMask = 0;
-      for (int i = 0, Size = Mask.size(); i < Size; ++i)
-        if (Mask[i] >= Size)
-          for (int j = 0; j < Scale; ++j)
-            BlendMask |= 1u << (i * Scale + j);
-
+      BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
        MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
        V1 = DAG.getBitcast(BlendVT, V1);
        V2 = DAG.getBitcast(BlendVT, V2);
        MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
        V1 = DAG.getBitcast(BlendVT, V1);
        V2 = DAG.getBitcast(BlendVT, V2);
@@ -6912,12 +6947,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
      // For integer shuffles we need to expand the mask and cast the inputs to
      // v8i16s prior to blending.
      int Scale = 8 / VT.getVectorNumElements();
      // For integer shuffles we need to expand the mask and cast the inputs to
      // v8i16s prior to blending.
      int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = 0;
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      if (Mask[i] >= Size)
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-
+    BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
      V1 = DAG.getBitcast(MVT::v8i16, V1);
      V2 = DAG.getBitcast(MVT::v8i16, V2);
      return DAG.getBitcast(VT,
      V1 = DAG.getBitcast(MVT::v8i16, V1);
      V2 = DAG.getBitcast(MVT::v8i16, V2);
      return DAG.getBitcast(VT,
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll

index e49941e221c9ef23c3c8dc8828f36664765bedb9..636209d948b82bf25d0b604c8ef8b5dbc9a587ef 100644 (file)
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -307,7 +307,8 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) {
  define <4 x double> @vperm2z_0x80(<4 x double> %a) {
  ; ALL-LABEL: vperm2z_0x80:
  ; ALL:       ## BB#0:
  define <4 x double> @vperm2z_0x80(<4 x double> %a) {
  ; ALL-LABEL: vperm2z_0x80:
  ; ALL:       ## BB#0:
-; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
+; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
  ; ALL-NEXT:    retq
    %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    ret <4 x double> %s
  ; ALL-NEXT:    retq
    %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    ret <4 x double> %s
@@ -325,7 +326,8 @@ define <4 x double> @vperm2z_0x81(<4 x double> %a) {
  define <4 x double> @vperm2z_0x82(<4 x double> %a) {
  ; ALL-LABEL: vperm2z_0x82:
  ; ALL:       ## BB#0:
  define <4 x double> @vperm2z_0x82(<4 x double> %a) {
  ; ALL-LABEL: vperm2z_0x82:
  ; ALL:       ## BB#0:
-; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
+; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
  ; ALL-NEXT:    retq
    %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
    ret <4 x double> %s
  ; ALL-NEXT:    retq
    %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
    ret <4 x double> %s
@@ -343,10 +345,21 @@ define <4 x double> @vperm2z_0x83(<4 x double> %a) {
  ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
  
  define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
  ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
  
  define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
-; ALL-LABEL: vperm2z_int_0x83:
-; ALL:       ## BB#0:
-; AVX1:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
-; AVX2:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX1-LABEL: vperm2z_int_0x83:
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vperm2z_int_0x83:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
    %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
    %c = add <4 x i64> %b, %s
    ret <4 x i64> %c
    %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
    %c = add <4 x i64> %b, %s
    ret <4 x i64> %c
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll

index 1d26d72d6fbcf4d61f88ad97287b22edec23f3e9..0f4ada5910befd1f99a22b4f97f11ed34724b493 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -952,6 +952,43 @@ define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
    ret <4 x float> %shuffle
  }
  
    ret <4 x float> %shuffle
  }
  
+define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
+; SSE2-LABEL: shuffle_v4f32_0z2z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_0z2z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0z2z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_0z2z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0z2z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
+  ret <4 x float> %shuffle
+}
+
  define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
  ; SSE-LABEL: shuffle_v4f32_u051:
  ; SSE:       # BB#0:
  define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
  ; SSE-LABEL: shuffle_v4f32_u051:
  ; SSE:       # BB#0:
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 29 Oct 2015 22:11:28 +0000 (22:11 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 29 Oct 2015 22:11:28 +0000 (22:11 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/avx-vperm2x128.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-128-v4.ll		patch \| blob \| history