Rename llvm.frameescape and llvm.framerecover to localescape and localrecover

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 458fa47363aab8b32d7dc3951356dfc1d6c68a8f..1b37912cf615f9117c07189d32ff8cd287a0e5d3 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1032,6 +1032,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
      setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
  
+    setOperationAction(ISD::SRA,               MVT::v2i64, Custom);
      setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
    }
  
@@ -1211,6 +1212,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
      setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
  
+    setOperationAction(ISD::SRA,               MVT::v4i64, Custom);
      setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
  
      // Custom lower several nodes for 256-bit types.
@@ -3938,6 +3940,15 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
    return Subtarget->hasLZCNT();
  }
  
+/// isUndefInRange - Return true if every element of Mask in the half-open
+/// range [Pos, Pos+Size) is negative, i.e. does not hold a real element index.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+  for (unsigned Offset = 0; Offset != Size; ++Offset)
+    if (Mask[Pos + Offset] >= 0)
+      return false;
+  return true;
+}
+
  /// isUndefOrInRange - Return true if Val is undef or if its value falls within
  /// the specified range (L, H].
  static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -4379,6 +4390,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
  /// IsUnary to true if only uses one source. Note that this will set IsUnary for
  /// shuffles which use a single input multiple times, and in those cases it will
  /// adjust the mask to only have indices within that single input.
+/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
  static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                   SmallVectorImpl<int> &Mask, bool &IsUnary) {
    unsigned NumElems = VT.getVectorNumElements();
@@ -4508,6 +4520,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
      if (Mask.empty()) return false;
+    // Mask only contains negative index if an element is zero.
+    if (std::any_of(Mask.begin(), Mask.end(), 
+                    [](int M){ return M == SM_SentinelZero; }))
+      return false;
      break;
    case X86ISD::MOVSLDUP:
      DecodeMOVSLDUPMask(VT, Mask);
@@ -6914,6 +6930,136 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
    return SDValue();
  }
  
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+///
+/// Only masks whose upper half is entirely undef are handled. Returns the
+/// EXTRQI or INSERTQI node on a match and a null SDValue otherwise.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+  int Size = Mask.size();
+  int HalfSize = Size / 2;
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  // Upper half must be undefined.
+  if (!isUndefInRange(Mask, HalfSize, HalfSize))
+    return SDValue();
+
+  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+  // Remainder of lower half result is zero and upper half is all undef.
+  auto LowerAsEXTRQ = [&]() {
+    // Determine the extraction length from the part of the lower half that
+    // isn't zeroable. Stop at Len == 0 so Zeroable[Len - 1] stays in bounds.
+    int Len = HalfSize;
+    for (; Len > 0; --Len)
+      if (!Zeroable[Len - 1])
+        break;
+    assert(Len > 0 && "Zeroable shuffle mask");
+
+    // Attempt to match first Len sequential elements from the lower half.
+    SDValue Src;
+    int Idx = -1;
+    for (int i = 0; i != Len; ++i) {
+      int M = Mask[i];
+      if (M < 0)
+        continue;
+      SDValue &V = (M < Size ? V1 : V2);
+      M = M % Size;
+
+      // Need a non-negative start index (M - i) and a lower-half element.
+      if (i > M || M >= HalfSize)
+        return SDValue();
+
+      if (Idx < 0 || (Src == V && Idx == (M - i))) {
+        Src = V;
+        Idx = M - i;
+        continue;
+      }
+      return SDValue();
+    }
+
+    if (Idx < 0)
+      return SDValue();
+
+    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+                       DAG.getConstant(BitLen, DL, MVT::i8),
+                       DAG.getConstant(BitIdx, DL, MVT::i8));
+  };
+
+  if (SDValue ExtrQ = LowerAsEXTRQ())
+    return ExtrQ;
+
+  // INSERTQ: Extract lowest Len elements from lower half of second source and
+  // insert over first source, starting at Idx.
+  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+  auto LowerAsInsertQ = [&]() {
+    for (int Idx = 0; Idx != HalfSize; ++Idx) {
+      SDValue Base;
+
+      // Attempt to match first source from mask before insertion point.
+      if (isUndefInRange(Mask, 0, Idx)) {
+        /* EMPTY */
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+        Base = V1;
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+        Base = V2;
+      } else {
+        continue;
+      }
+
+      // Extend the extraction length looking to match both the insertion of
+      // the second source and the remaining elements of the first.
+      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+        SDValue Insert;
+        int Len = Hi - Idx;
+
+        // Match insertion.
+        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+          Insert = V1;
+        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+          Insert = V2;
+        } else {
+          continue;
+        }
+
+        // Match the remaining elements of the lower half.
+        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+          /* EMPTY */
+        } else if ((!Base || (Base == V1)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+          Base = V1;
+        } else if ((!Base || (Base == V2)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+                                              Size + Hi)) {
+          Base = V2;
+        } else {
+          continue;
+        }
+
+        // We may not have a base (first source) - this can safely be undefined.
+        if (!Base)
+          Base = DAG.getUNDEF(VT);
+
+        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+                           DAG.getConstant(BitLen, DL, MVT::i8),
+                           DAG.getConstant(BitIdx, DL, MVT::i8));
+      }
+    }
+
+    return SDValue();
+  };
+
+  return LowerAsInsertQ();
+}
+
  /// \brief Lower a vector shuffle as a zero or any extension.
  ///
  /// Given a specific number of elements, element bit width, and extension
@@ -6921,7 +7067,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
  /// features of the subtarget.
  static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
      SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
-    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+    ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
    assert(Scale > 1 && "Need a scale to extend.");
    int NumElements = VT.getVectorNumElements();
    int EltBits = VT.getScalarSizeInBits();
@@ -6958,6 +7104,28 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
                          getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
    }
  
+  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+  // to 64-bits.
+  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+    assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+
+    SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                             DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                         DAG.getConstant(EltBits, DL, MVT::i8),
+                                         DAG.getConstant(0, DL, MVT::i8)));
+    if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+      return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+
+    SDValue Hi =
+        DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                    DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                DAG.getConstant(EltBits, DL, MVT::i8),
+                                DAG.getConstant(EltBits, DL, MVT::i8)));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+  }
+
    // If this would require more than 2 unpack instructions to expand, use
    // pshufb when available. We can only use more than 2 unpack instructions
    // when zero extending i8 elements which also makes it easier to use pshufb.
@@ -7048,7 +7216,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
        return SDValue();
  
      return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-        DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
+        DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
    };
  
    // The widest scale possible for extending is to a 64-bit integer.
@@ -8575,6 +8743,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
      return Shift;
  
+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget->hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+      return V;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Inputs == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
@@ -8727,6 +8900,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return ZExt;
  
+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget->hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+      return V;
+
    int NumV2Elements =
        std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
  
@@ -15091,7 +15269,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
            GlobalValue::getRealLinkageName(Fn->getName()));
    SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
    SDValue RegNodeFrameOffset =
-      DAG.getNode(ISD::FRAME_ALLOC_RECOVER, dl, PtrVT, OffsetSymVal);
+      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
  
    // RegNodeBase = EntryEBP - RegNodeSize
    // ParentFP = RegNodeBase - RegNodeFrameOffset
@@ -15116,6 +15294,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
      case INTR_TYPE_3OP:
        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
          Op.getOperand(2), Op.getOperand(3));
+    case INTR_TYPE_4OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
      case INTR_TYPE_1OP_MASK_RM: {
        SDValue Src = Op.getOperand(1);
        SDValue PassThru = Op.getOperand(2);
@@ -15772,6 +15953,9 @@ static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
    SDLoc dl(Op);
    SDValue Chain = Op.getOperand(0);
  
+  assert(Subtarget->getFrameLowering()->hasFP(MF) &&
+         "using llvm.x86.seh.restoreframe requires a frame pointer");
+
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT VT = TLI.getPointerTy();
  
@@ -15793,6 +15977,8 @@ static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);
  
    // FIXME: Restore the base pointer in case of stack realignment!
+  if (RegInfo->needsStackRealignment(MF))
+    report_fatal_error("SEH with stack realignment not yet implemented");
  
    // Adjust EBP to point back to the original frame position.
    SDValue NewFP = recoverFramePointer(DAG, MF.getFunction(), IncomingEBP);
@@ -16774,6 +16960,38 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
    unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
      (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
  
+  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+    SDValue Ex = DAG.getBitcast(ExVT, R);
+
+    if (ShiftAmt >= 32) {
+      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+      SDValue Upper =
+          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt - 32, DAG);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {9, 1, 11, 3, 13, 5, 15, 7});
+    } else {
+      // SRA upper i32, SRL whole i64 and select lower i32.
+      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt, DAG);
+      SDValue Lower =
+          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
+      Lower = DAG.getBitcast(ExVT, Lower);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {8, 1, 10, 3, 12, 5, 14, 7});
+    }
+    return DAG.getBitcast(VT, Ex);
+  };
+
    // Optimize shl/srl/sra with constant shift amount.
    if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
      if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
@@ -16782,6 +17000,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
        if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
          return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
  
+      // i64 SRA needs to be performed as partial shifts.
+      if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+          Op.getOpcode() == ISD::SRA)
+        return ArithmeticShiftRight64(ShiftAmt);
+
        if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
          unsigned NumElts = VT.getVectorNumElements();
          MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -16865,7 +17088,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
-    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+    if (Op.getOpcode() == ISD::SRA)
+      return ArithmeticShiftRight64(ShiftAmt);
    }
  
    return SDValue();
@@ -16947,7 +17175,9 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
          if (Vals[j] != Amt.getOperand(i + j))
            return SDValue();
      }
-    return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
+
+    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
    }
    return SDValue();
  }
@@ -18509,6 +18739,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FMINC:              return "X86ISD::FMINC";
    case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
    case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
+  case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
    case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
    case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
    case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
@@ -21676,16 +21908,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
      default: break;
      case ISD::SETULT:
      case ISD::SETULE:
-      Opc = hasUnsigned ? ISD::UMIN : 0u; break;
+      Opc = hasUnsigned ? ISD::UMIN : 0; break;
      case ISD::SETUGT:
      case ISD::SETUGE:
-      Opc = hasUnsigned ? ISD::UMAX : 0u; break;
+      Opc = hasUnsigned ? ISD::UMAX : 0; break;
      case ISD::SETLT:
      case ISD::SETLE:
-      Opc = hasSigned ? ISD::SMIN : 0u; break;
+      Opc = hasSigned ? ISD::SMIN : 0; break;
      case ISD::SETGT:
      case ISD::SETGE:
-      Opc = hasSigned ? ISD::SMAX : 0u; break;
+      Opc = hasSigned ? ISD::SMAX : 0; break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -21694,16 +21926,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
      default: break;
      case ISD::SETULT:
      case ISD::SETULE:
-      Opc = hasUnsigned ? ISD::UMAX : 0u; break;
+      Opc = hasUnsigned ? ISD::UMAX : 0; break;
      case ISD::SETUGT:
      case ISD::SETUGE:
-      Opc = hasUnsigned ? ISD::UMIN : 0u; break;
+      Opc = hasUnsigned ? ISD::UMIN : 0; break;
      case ISD::SETLT:
      case ISD::SETLE:
-      Opc = hasSigned ? ISD::SMAX : 0u; break;
+      Opc = hasSigned ? ISD::SMAX : 0; break;
      case ISD::SETGT:
      case ISD::SETGE:
-      Opc = hasSigned ? ISD::SMIN : 0u; break;
+      Opc = hasSigned ? ISD::SMIN : 0; break;
      }
    }