[X86] Lower VSELECT into SHRUNKBLEND when we shrink the bits used into the

author Quentin Colombet <qcolombet@apple.com>

Thu, 6 Nov 2014 02:25:03 +0000 (02:25 +0000)

committer Quentin Colombet <qcolombet@apple.com>

Thu, 6 Nov 2014 02:25:03 +0000 (02:25 +0000)
author Quentin Colombet <qcolombet@apple.com>
Thu, 6 Nov 2014 02:25:03 +0000 (02:25 +0000)
committer Quentin Colombet <qcolombet@apple.com>
Thu, 6 Nov 2014 02:25:03 +0000 (02:25 +0000)
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp

index 15b60ba5bf6edc9b254e20edc952ee2ba2ccf147..3ef7b2c7697bbeb6e8b95cfcce4a22d729ed6b47 100644 (file)
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2131,6 +2131,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
    case X86ISD::GlobalBaseReg:
      return getGlobalBaseReg();
  
+  case X86ISD::SHRUNKBLEND: {
+    // SHRUNKBLEND selects like a regular VSELECT.
+    SDValue VSelect = CurDAG->getNode(
+        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+        Node->getOperand(1), Node->getOperand(2));
+    ReplaceUses(SDValue(Node, 0), VSelect);
+    SelectCode(VSelect.getNode());
+    // We already called ReplaceUses.
+    return nullptr;
+  }
  
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_AND:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index bde948174d716f7efb4a62f263e030aad441e7e1..53d9f9134299d741ce9c79d52bdb8ac5b1de993b 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19025,6 +19025,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::ANDNP:              return "X86ISD::ANDNP";
    case X86ISD::PSIGN:              return "X86ISD::PSIGN";
    case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
    case X86ISD::SUBUS:              return "X86ISD::SUBUS";
    case X86ISD::HADD:               return "X86ISD::HADD";
    case X86ISD::HSUB:               return "X86ISD::HSUB";
@@ -22618,22 +22619,17 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
        // build_vector of constants. This will be taken care in a later
        // condition.
        (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
-       VT != MVT::v8i16)) {
+       VT != MVT::v8i16) &&
+      // Don't optimize vector of constants. Those are handled by
+      // the generic code and all the bits must be properly set for
+      // the generic optimizer.
+      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
      unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
  
      // Don't optimize vector selects that map to mask-registers.
      if (BitWidth == 1)
        return SDValue();
  
-    // Check all uses of that condition operand to check whether it will be
-    // consumed by non-BLEND instructions, which may depend on all bits are set
-    // properly.
-    for (SDNode::use_iterator I = Cond->use_begin(),
-                              E = Cond->use_end(); I != E; ++I)
-      if (I->getOpcode() != ISD::VSELECT)
-        // TODO: Add other opcodes eventually lowered into BLEND.
-        return SDValue();
-
      assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
      APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
  
@@ -22641,13 +22637,45 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                            DCI.isBeforeLegalizeOps());
      if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
-        (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
-                                  TLO) &&
-         // Don't optimize vector of constants. Those are handled by
-         // the generic code and all the bits must be properly set for
-         // the generic optimizer.
-         !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
-      DCI.CommitTargetLoweringOpt(TLO);
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+                                 TLO)) {
+      // If we changed the computation somewhere in the DAG, this change
+      // will affect all users of Cond.
+      // Make sure it is fine and update all the nodes so that we do not
+      // use the generic VSELECT anymore. Otherwise, we may perform
+      // wrong optimizations as we messed up with the actual expectation
+      // for the vector boolean values.
+      if (Cond != TLO.Old) {
+        // Check all uses of that condition operand to check whether it will be
+        // consumed by non-BLEND instructions, which may depend on all bits are
+        // set properly.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          if (I->getOpcode() != ISD::VSELECT)
+            // TODO: Add other opcodes eventually lowered into BLEND.
+            return SDValue();
+
+        // Update all the users of the condition, before committing the change,
+        // so that the VSELECT optimizations that expect the correct vector
+        // boolean value will not be triggered.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          DAG.ReplaceAllUsesOfValueWith(
+              SDValue(*I, 0),
+              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+                          Cond, I->getOperand(1), I->getOperand(2)));
+        DCI.CommitTargetLoweringOpt(TLO);
+        return SDValue();
+      }
+      // At this point, only Cond is changed. Change the condition
+      // just for N to keep the opportunity to optimize all other
+      // users their own way.
+      DAG.ReplaceAllUsesOfValueWith(
+          SDValue(N, 0),
+          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+                      TLO.New, N->getOperand(1), N->getOperand(2)));
+      return SDValue();
+    }
    }
  
    // We should generate an X86ISD::BLENDI from a vselect if its argument
@@ -22661,7 +22689,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // Iff we find this pattern and the build_vectors are built from
    // constants, we translate the vselect into a shuffle_vector that we
    // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
-  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+  if ((N->getOpcode() == ISD::VSELECT ||
+       N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+      !DCI.isBeforeLegalize()) {
      SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
      if (Shuffle.getNode())
        return Shuffle;
@@ -24854,7 +24884,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::EXTRACT_VECTOR_ELT:
      return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
    case ISD::VSELECT:
-  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::SELECT:
+  case X86ISD::SHRUNKBLEND:
+    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
    case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
    case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
    case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index 35e132b944a19d90841d29753cf77420e6b28460..2737703bd6db53f32ac0ce0c4960c2c78ec8609f 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -190,6 +190,11 @@ namespace llvm {
        /// BLENDI - Blend where the selector is an immediate.
        BLENDI,
  
+      /// SHRUNKBLEND - Blend where the condition has been shrunk.
+      /// This is used to emphasize that the condition mask is
+      /// no more valid for generic VSELECT optimizations.
+      SHRUNKBLEND,
+
        /// ADDSUB - Combined add and sub on an FP vector.
        ADDSUB,
  
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll

index 7022b71336fdfd44ec326268b2045798f50276dc..0a3ed7e4b32eba6eb72aae4289a744e84abd946c 100644 (file)
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -315,9 +315,9 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
  ; SSE41-LABEL: vsel_double8:
  ; SSE41:       # BB#0: # %entry
  ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm5[0,1]
  ; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm3 = xmm7[0,1]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
  ; SSE41-NEXT:    retq
  ;
  ; AVX-LABEL: vsel_double8:
@@ -353,10 +353,10 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
  ;
  ; SSE41-LABEL: vsel_i648:
  ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm5[0,1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm3 = xmm7[0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
  ; SSE41-NEXT:    retq
  ;
  ; AVX1-LABEL: vsel_i648:
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll

index 7926b0c71f12c8793fb397bcca8e104e61c2d1b4..0c0f4bbf992aa31ff1d559fc09791e2758aab965 100644 (file)
--- a/test/CodeGen/X86/vselect-avx.ll
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -25,3 +25,61 @@ body:
    store <4 x i16> %predphi42, <4 x i16>* %b, align 8
    ret void
  }
+
+; Improve code coverage.
+;
+; When shrinking the condition used into the select to match a blend, this
+; test case exercises the path where the modified node is not the root
+; of the condition.
+;
+; CHECK-LABEL: test2:
+; CHECK:       vpslld  $31, %xmm0, %xmm0
+; CHECK-NEXT:  vpmovsxdq       %xmm0, %xmm1
+; CHECK-NEXT:  vpshufd $78, %xmm0, %xmm0       ## xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:  vpmovsxdq       %xmm0, %xmm0
+; CHECK-NEXT:  vinsertf128     $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
+; CHECK: vblendvpd     [[MASK]]
+; CHECK: retq
+define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+bb:
+  %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198
+  %tmp1888 = load double** %arrayidx1928, align 8
+  %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+  %tmp1900 = bitcast double* %tmp1888 to <4 x double>*
+  store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8
+  ret void
+}
+
+; For this test, we used to optimized the conditional mask for the blend, i.e.,
+; we shrunk some of its bits.
+; However, this same mask was used in another select (%predphi31) that turned out
+; to be optimized into a and. In that case, the conditional mask was wrong.
+;
+; Make sure that the and is fed by the original mask.
+; 
+; <rdar://problem/18819506>
+
+; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
+; cannot express that ORIG_MASK must not be equal to ORIG_MASK. Otherwise,
+; even a faulty pattern would pass!
+;  
+; CHECK-LABEL: test3:
+; Compute the original mask.
+;      CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
+; Shrink the bit of the mask.
+; CHECK-NEXT: vpslld   $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
+; Use the shrunk mask in the blend.
+; CHECK-NEXT:  vblendvps       [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the original mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}} 
+; CHECK: retq
+define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,  <4 x i16> %tmp3, <4 x i16> %tmp12) {
+  %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
+  %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
+  %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
+  %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+
+  store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8
+  store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
+ ret void
+}
author	Quentin Colombet <qcolombet@apple.com>
	Thu, 6 Nov 2014 02:25:03 +0000 (02:25 +0000)
committer	Quentin Colombet <qcolombet@apple.com>
	Thu, 6 Nov 2014 02:25:03 +0000 (02:25 +0000)
lib/Target/X86/X86ISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
test/CodeGen/X86/vector-blend.ll		patch \| blob \| history
test/CodeGen/X86/vselect-avx.ll		patch \| blob \| history