[X86] Lower SEXTLOAD using SIGN_EXTEND_VECTOR_INREG. NFCI.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 7da7f80fc9e910c345175b624934d94f00da59ea..b1e1dfa5f79fe9ab62659ce4a00d883d987bbd70 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1785,7 +1785,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    MaxStoresPerMemmoveOptSize = 4;
    setPrefLoopAlignment(4); // 2^4 bytes.
  
-  // Predictable cmov don't hurt on atom because it's in-order.
+  // A predictable cmov does not hurt on an in-order CPU.
+  // FIXME: Use a CPU attribute to trigger this, not a CPU model.
    PredictableSelectIsExpensive = !Subtarget->isAtom();
    EnableExtLdPromotion = true;
    setPrefFunctionAlignment(4); // 2^4 bytes.
@@ -2079,6 +2080,29 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
    return true;
  }
  
+/// On Android, return true and set AddressSpace/Offset to the fixed TLS slot of
+/// the SafeStack pointer; otherwise return false. See TLS_SLOT_SAFESTACK in
+/// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+bool X86TargetLowering::getSafeStackPointerLocation(unsigned &AddressSpace,
+                                                    unsigned &Offset) const {
+  if (!Subtarget->isTargetAndroid())
+    return false;
+
+  if (Subtarget->is64Bit()) {
+    // %fs:0x48 (AS 257), or %gs:0x48 (AS 256) under the Kernel code model.
+    Offset = 0x48;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x24 (address space 256) on i386.
+    Offset = 0x24;
+    AddressSpace = 256;
+  }
+  return true;
+}
+
  bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
    assert(SrcAS != DestAS && "Expected different address spaces!");
@@ -2402,7 +2426,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
  /// supports tail call optimization.
  static bool IsTailCallConvention(CallingConv::ID CC) {
    return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
-          CC == CallingConv::HiPE);
+          CC == CallingConv::HiPE || CC == CallingConv::HHVM);
  }
  
  /// \brief Return true if the calling convention is a C calling convention.
@@ -2995,7 +3019,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
  
    // Get a count of how many bytes are to be pushed on the stack.
-  unsigned NumBytes = CCInfo.getNextStackOffset();
+  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
    if (IsSibcall)
      // This is a sibcall. The memory operands are available in caller's
      // own caller's stack.
@@ -3876,6 +3900,10 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
  /// Callee pop is necessary to support tail calls.
  bool X86::isCalleePop(CallingConv::ID CallingConv,
                        bool is64Bit, bool IsVarArg, bool TailCallOpt) {
+
+  if (IsTailCallConvention(CallingConv))
+    return IsVarArg ? false : TailCallOpt;
+
    switch (CallingConv) {
    default:
      return false;
@@ -3883,12 +3911,6 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
      return !is64Bit;
-  case CallingConv::Fast:
-  case CallingConv::GHC:
-  case CallingConv::HiPE:
-    if (IsVarArg)
-      return false;
-    return TailCallOpt;
    }
  }
  
@@ -7462,13 +7484,15 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
                                                    MVT::v16i8, PSHUFBMask)));
    }
  
-  // If we are extending from an (odd)offset, shuffle them by 1 element.
-  if (Offset & 1) {
+  // If we are extending from an offset, ensure we start on a boundary that
+  // we can unpack from.
+  int AlignToUnpack = Offset % (NumElements / Scale);
+  if (AlignToUnpack) {
      SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
-    for (int i = 1; i < NumElements; ++i)
-      ShMask[i - 1] = i;
+    for (int i = AlignToUnpack; i < NumElements; ++i)
+      ShMask[i - AlignToUnpack] = i;
      InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
-    Offset--;
+    Offset -= AlignToUnpack;
    }
  
    // Otherwise emit a sequence of unpacks.
@@ -12443,6 +12467,15 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
    //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
    //     return (float4) lo + fhi;
  
+  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+  // reassociate the two FADDs, and if we do that, the algorithm fails
+  // spectacularly (PR24512).
+  // FIXME: If we ever have some kind of Machine FMF, this should be marked
+  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+  // there's also the MachineCombiner reassociations happening on Machine IR.
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
    SDLoc DL(Op);
    SDValue V = Op->getOperand(0);
    EVT VecIntVT = V.getValueType();
@@ -12753,7 +12786,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
      // for DAG type consistency we have to match the FP operand type.
  
      APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
-    APFloat::opStatus Status = APFloat::opOK;
+    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
      bool LosesInfo = false;
      if (TheVT == MVT::f64)
        // The rounding mode is irrelevant as the conversion should be exact.
@@ -15024,29 +15057,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
        return Sext;
      }
  
-    // Otherwise we'll shuffle the small elements in the high bits of the
-    // larger type and perform an arithmetic shift. If the shift is not legal
-    // it's better to scalarize.
-    assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
-           "We can't implement a sext load without an arithmetic right shift!");
-
-    // Redistribute the loaded elements into the different locations.
-    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i != NumElems; ++i)
-      ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
-
-    SDValue Shuff = DAG.getVectorShuffle(
-        WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
-
-    Shuff = DAG.getBitcast(RegVT, Shuff);
-
-    // Build the arithmetic shift.
-    unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
-                   MemVT.getVectorElementType().getSizeInBits();
-    Shuff =
-        DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
-                    DAG.getConstant(Amt, dl, RegVT));
+    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
+    // lanes.
+    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
+           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
  
+    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Shuff;
    }
@@ -17869,18 +17885,28 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
  
        // i64 SRA needs to be performed as partial shifts.
        if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
-          Op.getOpcode() == ISD::SRA)
+          Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
          return ArithmeticShiftRight64(ShiftAmt);
  
        if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
          unsigned NumElts = VT.getVectorNumElements();
          MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
  
-        if (Op.getOpcode() == ISD::SHL) {
-          // Simple i8 add case
-          if (ShiftAmt == 1)
-            return DAG.getNode(ISD::ADD, dl, VT, R, R);
+        // Simple i8 add case
+        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+          return DAG.getNode(ISD::ADD, dl, VT, R, R);
  
+        // ashr(R, 7)  === cmp_slt(R, 0)
+        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
+          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+        }
+
+        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+        if (VT == MVT::v16i8 && Subtarget->hasXOP())
+          return SDValue();
+
+        if (Op.getOpcode() == ISD::SHL) {
            // Make a large shift.
            SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                     R, ShiftAmt, DAG);
@@ -17903,12 +17929,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                               DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
          }
          if (Op.getOpcode() == ISD::SRA) {
-          if (ShiftAmt == 7) {
-            // ashr(R, 7)  === cmp_slt(R, 0)
-            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
-            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
-          }
-
            // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
            SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
            SmallVector<SDValue, 32> V(NumElts,
@@ -17925,7 +17945,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
    }
  
    // Special case in 32-bit mode, where i64 is expanded into high and low parts.
-  if (!Subtarget->is64Bit() &&
+  if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
        (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
  
      // Peek through any splat that was introduced for i64 shift vectorization.
@@ -18079,11 +18099,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      return V;
  
    if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
-      return V;
+    return V;
  
    if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
      return Op;
  
+  // XOP has 128-bit variable logical/arithmetic shifts.
+  // A positive Amt shifts left; a negative Amt shifts right.
+  if (Subtarget->hasXOP() &&
+      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+       VT == MVT::v8i16 || VT == MVT::v16i8)) {
+    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
+      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
+    }
+    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+    if (Op.getOpcode() == ISD::SRA)
+      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+  }
+
    // 2i64 vector logical shifts can efficiently avoid scalarization - do the
    // shifts per-lane and then shuffle the partial results back together.
    if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
@@ -18272,7 +18307,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
    }
  
-  if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) {
+  if (VT == MVT::v16i8 ||
+      (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
      MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
      unsigned ShiftOpcode = Op->getOpcode();
  
@@ -18392,7 +18428,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                         DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
    }
  
-  if (Subtarget->hasInt256() && VT == MVT::v16i16) {
+  if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
      MVT ExtVT = MVT::v8i32;
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
@@ -19796,6 +19832,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::RDSEED:             return "X86ISD::RDSEED";
    case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
    case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
+  case X86ISD::VPSHA:              return "X86ISD::VPSHA";
+  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
    case X86ISD::FMADD:              return "X86ISD::FMADD";
    case X86ISD::FMSUB:              return "X86ISD::FMSUB";
    case X86ISD::FNMADD:             return "X86ISD::FNMADD";
@@ -24519,6 +24557,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
    if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
      return R;
  
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    EVT VT = N->getValueType(0);
@@ -24761,6 +24802,9 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
      if (SDValue RV = performIntegerAbsCombine(N, DAG))
        return RV;
  
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
    return SDValue();
  }