[X86][SSE] Add general memory folding for (V)INSERTPS instruction

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 1f5b4d964770c84fd4b091ea688d78dc0e915d32..685067a28c60de023e74ff418e7878ddabfd4738 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3959,7 +3959,6 @@ static bool isX86CCUnsigned(unsigned X86CC) {
    case X86::COND_BE:    return true;
    case X86::COND_AE:    return true;
    }
-  llvm_unreachable("covered switch fell through?!");
  }
  
  /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
@@ -4207,8 +4206,8 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
  
  static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
    assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
-  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
-    llvm_unreachable("Illegal extract subvector for VEXTRACT");
+  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
+         "Illegal extract subvector for VEXTRACT");
  
    uint64_t Index =
      cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
@@ -4222,8 +4221,8 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
  
  static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
    assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
-  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
-    llvm_unreachable("Illegal insert subvector for VINSERT");
+  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
+         "Illegal insert subvector for VINSERT");
  
    uint64_t Index =
      cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
@@ -12657,11 +12656,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
      return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
    case MVT::v16i8:
    case MVT::v16i16:
-    if (Subtarget->hasAVX512())
-      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
-                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
+    assert(Subtarget->hasAVX512());
+    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
    }
-  llvm_unreachable(nullptr);
  }
  
  SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
@@ -13775,8 +13773,8 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
      if (C->getAPIntValue() == 0)
        return EmitTest(Op0, X86CC, dl, DAG);
  
-     if (Op0.getValueType() == MVT::i1)
-       llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
+     assert(Op0.getValueType() != MVT::i1 &&
+            "Unexpected comparison operation for MVT::i1 operands");
    }
  
    if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
@@ -16653,13 +16651,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                SDValue Index, SDValue ScaleOp, SDValue Chain,
                                const X86Subtarget * Subtarget) {
    SDLoc dl(Op);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  if (!C)
-    llvm_unreachable("Invalid scale type");
-  unsigned ScaleVal = C->getZExtValue();
-  if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
-    llvm_unreachable("Valid scale values are 1, 2, 4, 8");
-
+  auto *C = cast<ConstantSDNode>(ScaleOp);
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
    MVT MaskVT = MVT::getVectorVT(MVT::i1,
                               Index.getSimpleValueType().getVectorNumElements());
@@ -16692,13 +16684,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain) {
    SDLoc dl(Op);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  if (!C)
-    llvm_unreachable("Invalid scale type");
-  unsigned ScaleVal = C->getZExtValue();
-  if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
-    llvm_unreachable("Valid scale values are 1, 2, 4, 8");
-
+  auto *C = cast<ConstantSDNode>(ScaleOp);
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
    SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -16728,8 +16714,7 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Mask, SDValue Base, SDValue Index,
                                 SDValue ScaleOp, SDValue Chain) {
    SDLoc dl(Op);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  assert(C && "Invalid scale type");
+  auto *C = cast<ConstantSDNode>(ScaleOp);
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
    SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -16965,9 +16950,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
  
    SDLoc dl(Op);
    switch(IntrData->Type) {
-  default:
-    llvm_unreachable("Unknown Intrinsic Type");
-    break;
+  default: llvm_unreachable("Unknown Intrinsic Type");
    case RDSEED:
    case RDRAND: {
      // Emit the node with the right value type.
@@ -19697,8 +19680,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    case X86ISD::FMAXC:
    case X86ISD::FMAX: {
      EVT VT = N->getValueType(0);
-    if (VT != MVT::v2f32)
-      llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
      SDValue UNDEF = DAG.getUNDEF(VT);
      SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                                N->getOperand(0), UNDEF);
@@ -23195,9 +23177,9 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
  
    EVT VT = N->getValueType(0);
  
-  if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
+  if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
        InputVector.getOpcode() == ISD::BITCAST &&
-      dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
+      isa<ConstantSDNode>(InputVector.getOperand(0))) {
      uint64_t ExtractedElt =
          cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      uint64_t InputValue =
@@ -26192,52 +26174,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
-                                         SelectionDAG &DAG) {
-  SDLoc dl(Load);
-  MVT VT = Load->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue Addr = Load->getOperand(1);
-  SDValue NewAddr = DAG.getNode(
-      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-      DAG.getConstant(Index * EVT.getStoreSize(), dl,
-                      Addr.getSimpleValueType()));
-
-  SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
-  return NewLoad;
-}
-
-static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget *Subtarget) {
-  SDLoc dl(N);
-  MVT VT = N->getOperand(1)->getSimpleValueType(0);
-  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
-         "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
-    // Extract the countS bits from the immediate so we can get the proper
-    // address when narrowing the vector load to a specific element.
-    // When the second source op is a memory address, insertps doesn't use
-    // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
-    // Create this as a scalar to vector to match the instruction pattern.
-    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-    // countS bits are ignored when loading from memory on insertps, which
-    // means we don't need to explicitly set them to 0.
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                       LoadScalarToVector, N->getOperand(2));
-  }
-  return SDValue();
-}
-
  static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
    SDValue V0 = N->getOperand(0);
    SDValue V1 = N->getOperand(1);
@@ -26437,8 +26373,8 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
    }
  
    // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
-  // a 32-bit target where SSE doesn't support i64->FP operations.
-  if (Op0.getOpcode() == ISD::LOAD) {
+  // a 32-bit target where SSE doesn't support i64->FP operations.  
+  if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
      LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
      EVT LdVT = Ld->getValueType(0);
  
@@ -26703,11 +26639,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::VPERM2X128:
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
    case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS: {
-    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return PerformINSERTPSCombine(N, DAG, Subtarget);
-    break;
-  }
    case X86ISD::BLENDI:    return PerformBLENDICombine(N, DAG);
    }