[x86] Fix wrong lowering of vsetcc nodes (PR25080).

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 6fd04733e01dce21372cf0b17700320b3851250d..2c933c7edca5238670f856e171123f02362c1eda 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1607,6 +1607,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Legal);
      setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
      setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
      setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
      setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
@@ -1618,6 +1622,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
      setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
      setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
      setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
@@ -2695,10 +2701,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
    }
  
    MachineModuleInfo &MMI = MF.getMMI();
-  const Function *WinEHParent = nullptr;
-  if (MMI.hasWinEHFuncInfo(Fn))
-    WinEHParent = MMI.getWinEHParent(Fn);
-  bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
  
    // Figure out if XMM registers are in use.
    assert(!(Subtarget->useSoftFloat() &&
@@ -2855,7 +2857,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
  
    FuncInfo->setArgumentStackSize(StackSize);
  
-  if (IsWinEHParent) {
+  if (MMI.hasWinEHFuncInfo(Fn)) {
      if (Is64Bit) {
        int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
        SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
@@ -3390,9 +3392,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
    assert(Mask && "Missing call preserved mask for calling convention");
  
-  // If this is an invoke in a 32-bit function using an MSVC personality, assume
-  // the function clobbers all registers. If an exception is thrown, the runtime
-  // will not restore CSRs.
+  // If this is an invoke in a 32-bit function using a funclet-based
+  // personality, assume the function clobbers all registers. If an exception
+  // is thrown, the runtime will not restore CSRs.
    // FIXME: Model this more precisely so that we can register allocate across
    // the normal edge and spill and fill across the exceptional edge.
    if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
@@ -3401,7 +3403,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
          CallerFn->hasPersonalityFn()
              ? classifyEHPersonality(CallerFn->getPersonalityFn())
              : EHPersonality::Unknown;
-    if (isMSVCEHPersonality(Pers))
+    if (isFuncletEHPersonality(Pers))
        Mask = RegInfo->getNoPreservedMask();
    }
  
@@ -4229,7 +4231,7 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
    return getInsertVINSERTImmediate(N, 256);
  }
  
-/// Returns true if Elt is a constant integer zero
+/// Returns true if V is a constant integer zero.
  static bool isZero(SDValue V) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
    return C && C->isNullValue();
@@ -11085,8 +11087,13 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
                                      unsigned &MaskValue) {
    MaskValue = 0;
    unsigned NumElems = BuildVector->getNumOperands();
+  
    // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  // We don't handle the >2 lanes case right now.
    unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  if (NumLanes > 2)
+    return false;
+
    unsigned NumElemsInLane = NumElems / NumLanes;
  
    // Blend for v16i16 should be symmetric for the both lanes.
@@ -11101,16 +11108,21 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
      if (isa<ConstantSDNode>(SndLaneEltCond))
        Lane2Cond = !isZero(SndLaneEltCond);
  
+    unsigned LaneMask = 0;
      if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
        // Lane1Cond != 0, means we want the first argument.
        // Lane1Cond == 0, means we want the second argument.
        // The encoding of this argument is 0 for the first argument, 1
        // for the second. Therefore, invert the condition.
-      MaskValue |= !Lane1Cond << i;
+      LaneMask = !Lane1Cond << i;
      else if (Lane1Cond < 0)
-      MaskValue |= !Lane2Cond << i;
+      LaneMask = !Lane2Cond << i;
      else
        return false;
+
+    MaskValue |= LaneMask;
+    if (NumLanes == 2)
+      MaskValue |= LaneMask << NumElemsInLane;
    }
    return true;
  }
@@ -12467,6 +12479,15 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
    //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
    //     return (float4) lo + fhi;
  
+  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+  // reassociate the two FADDs, and if we do that, the algorithm fails
+  // spectacularly (PR24512).
+  // FIXME: If we ever have some kind of Machine FMF, this should be marked
+  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+  // there's also the MachineCombiner reassociations happening on Machine IR.
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
    SDLoc DL(Op);
    SDValue V = Op->getOperand(0);
    EVT VecIntVT = V.getValueType();
@@ -14132,6 +14153,35 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                         DAG.getConstant(SSECC, dl, MVT::i8));
    }
  
+  MVT VTOp0 = Op0.getSimpleValueType();
+  assert(VTOp0 == Op1.getSimpleValueType() &&
+         "Expected operands with same type!");
+  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
+         "Invalid number of packed elements for source and destination!");
+
+  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
+    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
+    // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
+    // legalizer first checks whether the first input operand of the setcc has
+    // a legal type. If so, then it promotes the return type to that same type.
+    // Otherwise, the return type is promoted to the 'next legal type' which,
+    // for a vector of MVT::i1, is always a 128-bit integer vector type.
+    //
+    // We reach this code only if the following two conditions are met:
+    // 1. Both return type and operand type have been promoted to wider types
+    //    by the type legalizer.
+    // 2. The original operand type has been promoted to a 256-bit vector.
+    //
+    // Note that condition 2 applies only to AVX targets.
+    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+    return DAG.getZExtOrTrunc(NewOp, dl, VT);
+  }
+
+  // The non-AVX512 code below works under the assumption that source and
+  // destination types are the same.
+  assert((Subtarget->hasAVX512() || (VT == VTOp0)) &&
+         "Value types for source and destination must be the same!");
+
    // Break 256-bit integer vector compare into smaller ones.
    if (VT.is256BitVector() && !Subtarget->hasInt256())
      return Lower256IntVSETCC(Op, DAG);
@@ -14158,6 +14208,33 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
    }
  
+  // Lower using XOP integer comparisons.
+  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
+       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
+    // Translate compare code to XOP PCOM compare mode.
+    unsigned CmpMode = 0;
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Unexpected SETCC condition");
+    case ISD::SETULT:
+    case ISD::SETLT: CmpMode = 0x00; break;
+    case ISD::SETULE:
+    case ISD::SETLE: CmpMode = 0x01; break;
+    case ISD::SETUGT:
+    case ISD::SETGT: CmpMode = 0x02; break;
+    case ISD::SETUGE:
+    case ISD::SETGE: CmpMode = 0x03; break;
+    case ISD::SETEQ: CmpMode = 0x04; break;
+    case ISD::SETNE: CmpMode = 0x05; break;
+    }
+
+    // Are we comparing unsigned or signed integers?
+    unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
+      ? X86ISD::VPCOMU : X86ISD::VPCOM;
+
+    return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                       DAG.getConstant(CmpMode, dl, MVT::i8));
+  }
+
    // We are handling one of the integer comparisons here.  Since SSE only has
    // GT and EQ comparisons for integer, swapping operands and multiple
    // operations may be required for some comparisons.
@@ -15048,29 +15125,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
        return Sext;
      }
  
-    // Otherwise we'll shuffle the small elements in the high bits of the
-    // larger type and perform an arithmetic shift. If the shift is not legal
-    // it's better to scalarize.
-    assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
-           "We can't implement a sext load without an arithmetic right shift!");
-
-    // Redistribute the loaded elements into the different locations.
-    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i != NumElems; ++i)
-      ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
-
-    SDValue Shuff = DAG.getVectorShuffle(
-        WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
-
-    Shuff = DAG.getBitcast(RegVT, Shuff);
-
-    // Build the arithmetic shift.
-    unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
-                   MemVT.getVectorElementType().getSizeInBits();
-    Shuff =
-        DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
-                    DAG.getConstant(Amt, dl, RegVT));
+    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
+    // lanes.
+    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
+           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
  
+    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Shuff;
    }
@@ -16011,11 +16071,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                                RoundingMode, Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
-    case INTR_TYPE_2OP_MASK: {
+    case INTR_TYPE_2OP_MASK:
+    case INTR_TYPE_2OP_IMM8_MASK: {
        SDValue Src1 = Op.getOperand(1);
        SDValue Src2 = Op.getOperand(2);
        SDValue PassThru = Op.getOperand(3);
        SDValue Mask = Op.getOperand(4);
+
+      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
+        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have non-default rounding mode,
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -19842,6 +19907,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
    case X86ISD::VPSHA:              return "X86ISD::VPSHA";
    case X86ISD::VPSHL:              return "X86ISD::VPSHL";
+  case X86ISD::VPCOM:              return "X86ISD::VPCOM";
+  case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
    case X86ISD::FMADD:              return "X86ISD::FMADD";
    case X86ISD::FMSUB:              return "X86ISD::FMSUB";
    case X86ISD::FNMADD:             return "X86ISD::FNMADD";
@@ -21953,10 +22020,22 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
    MVT RootVT = Root.getSimpleValueType();
    SDLoc DL(Root);
  
-  // Just remove no-op shuffle masks.
    if (Mask.size() == 1) {
-    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
-                  /*AddTo*/ true);
+    int Index = Mask[0];
+    assert((Index >= 0 || Index == SM_SentinelUndef ||
+            Index == SM_SentinelZero) &&
+           "Invalid shuffle index found!");
+
+    // We may end up with an accumulated mask of size 1 as a result of
+    // widening of shuffle operands (see function canWidenShuffleElements).
+    // If the only shuffle index is equal to SM_SentinelZero then propagate
+    // a zero vector. Otherwise, the combined shuffle mask is a no-op shuffle
+    // mask, and therefore the entire chain of shuffles can be folded away.
+    if (Index == SM_SentinelZero)
+      DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
+    else
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+                    /*AddTo*/ true);
      return true;
    }
  
@@ -22900,15 +22979,15 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                           InputVector.getNode()->getOperand(0));
  
      // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
-    SDValue MMXSrcOp = MMXSrc.getOperand(0);
      if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
-        MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
-        MMXSrcOp.getOpcode() == ISD::BITCAST &&
-        MMXSrcOp.getValueType() == MVT::v1i64 &&
-        MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
-      return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
-                         N->getValueType(0),
-                         MMXSrcOp.getOperand(0));
+        MMXSrc.getValueType() == MVT::i64) {
+      SDValue MMXSrcOp = MMXSrc.getOperand(0);
+      if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
+          MMXSrcOp.getValueType() == MVT::v1i64 &&
+          MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+        return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+                           N->getValueType(0), MMXSrcOp.getOperand(0));
+    }
    }
  
    EVT VT = N->getValueType(0);
@@ -25686,10 +25765,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
-  if (!Subtarget->hasFp256())
-    return SDValue();
-
-  if (VT.isVector() && VT.getSizeInBits() == 256)
+  if (Subtarget->hasAVX() && VT.isVector() && VT.getSizeInBits() == 256)
      if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
        return R;