[X86] Convert to MVT instead of calling EVT functions since we already know the type...

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 83c733a523d622561f9729b6a54febe2274b6339..c663a53c9948c088fceea6276bf17e728ea0d0eb 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -75,7 +75,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
  
    // Set up the TargetLowering object.
-  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
  
    // X86 is weird. It always uses i8 for shift amounts and setcc results.
    setBooleanContents(ZeroOrOneBooleanContent);
@@ -270,8 +269,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    // (low) operations are left as Legal, as there are single-result
    // instructions for this in x86. Using the two-result multiply instructions
    // when both high and low results are needed must be arranged by dagcombine.
-  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
-    MVT VT = IntVTs[i];
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
@@ -462,8 +460,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
  
    // Expand certain atomics
-  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
-    MVT VT = IntVTs[i];
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
      setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
      setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
@@ -861,14 +858,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
  
      // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
-    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
-      MVT VT = (MVT::SimpleValueType)i;
-      // Do not attempt to custom lower non-power-of-2 vectors
-      if (!isPowerOf2_32(VT.getVectorNumElements()))
-        continue;
-      // Do not attempt to custom lower non-128-bit vectors
-      if (!VT.is128BitVector())
-        continue;
+    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
        setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
        setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
        setOperationAction(ISD::VSELECT,            VT, Custom);
@@ -906,13 +896,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      }
  
      // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
-    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
-      MVT VT = (MVT::SimpleValueType)i;
-
-      // Do not attempt to promote non-128-bit vectors
-      if (!VT.is128BitVector())
-        continue;
-
+    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
        setOperationAction(ISD::AND,    VT, Promote);
        AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
        setOperationAction(ISD::OR,     VT, Promote);
@@ -1291,13 +1275,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
  
      // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
-    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
-      MVT VT = (MVT::SimpleValueType)i;
-
-      // Do not attempt to promote non-256-bit vectors
-      if (!VT.is256BitVector())
-        continue;
-
+    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
        setOperationAction(ISD::AND,    VT, Promote);
        AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
        setOperationAction(ISD::OR,     VT, Promote);
@@ -1605,13 +1583,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
          setOperationAction(ISD::MSTORE,              VT, Legal);
        }
      }
-    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
-      MVT VT = (MVT::SimpleValueType)i;
-
-      // Do not attempt to promote non-512-bit vectors.
-      if (!VT.is512BitVector())
-        continue;
-
+    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
        setOperationAction(ISD::SELECT, VT, Promote);
        AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
      }
@@ -1652,6 +1624,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
      setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
      setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
      setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
@@ -1687,19 +1660,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8,  Custom);
      }
  
-    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
-      const MVT VT = (MVT::SimpleValueType)i;
-
-      const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-
-      // Do not attempt to promote non-512-bit vectors.
-      if (!VT.is512BitVector())
-        continue;
-
-      if (EltSize < 32) {
-        setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
-        setOperationAction(ISD::VSELECT,             VT, Legal);
-      }
+    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+      setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
+      setOperationAction(ISD::VSELECT,             VT, Legal);
      }
    }
  
@@ -1752,9 +1715,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    // FIXME: We really should do custom legalization for addition and
    // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
    // than generic legalization for 64-bit multiplication-with-overflow, though.
-  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+    if (VT == MVT::i64 && !Subtarget->is64Bit())
+      continue;
      // Add/Sub/Mul with overflow operations are custom lowered.
-    MVT VT = IntVTs[i];
      setOperationAction(ISD::SADDO, VT, Custom);
      setOperationAction(ISD::UADDO, VT, Custom);
      setOperationAction(ISD::SSUBO, VT, Custom);
@@ -1861,40 +1825,43 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
    if (!VT.isVector())
      return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
  
-  const unsigned NumElts = VT.getVectorNumElements();
-  const EVT EltVT = VT.getVectorElementType();
-  if (VT.is512BitVector()) {
-    if (Subtarget->hasAVX512())
-      if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
-          EltVT == MVT::f32 || EltVT == MVT::f64)
-        switch(NumElts) {
-        case  8: return MVT::v8i1;
-        case 16: return MVT::v16i1;
-      }
-    if (Subtarget->hasBWI())
-      if (EltVT == MVT::i8 || EltVT == MVT::i16)
-        switch(NumElts) {
-        case 32: return MVT::v32i1;
-        case 64: return MVT::v64i1;
-      }
-  }
+  if (VT.isSimple()) {
+    MVT VVT = VT.getSimpleVT();
+    const unsigned NumElts = VVT.getVectorNumElements();
+    const MVT EltVT = VVT.getVectorElementType();
+    if (VVT.is512BitVector()) {
+      if (Subtarget->hasAVX512())
+        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+            EltVT == MVT::f32 || EltVT == MVT::f64)
+          switch(NumElts) {
+          case  8: return MVT::v8i1;
+          case 16: return MVT::v16i1;
+        }
+      if (Subtarget->hasBWI())
+        if (EltVT == MVT::i8 || EltVT == MVT::i16)
+          switch(NumElts) {
+          case 32: return MVT::v32i1;
+          case 64: return MVT::v64i1;
+        }
+    }
  
-  if (VT.is256BitVector() || VT.is128BitVector()) {
-    if (Subtarget->hasVLX())
-      if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
-          EltVT == MVT::f32 || EltVT == MVT::f64)
-        switch(NumElts) {
-        case 2: return MVT::v2i1;
-        case 4: return MVT::v4i1;
-        case 8: return MVT::v8i1;
-      }
-    if (Subtarget->hasBWI() && Subtarget->hasVLX())
-      if (EltVT == MVT::i8 || EltVT == MVT::i16)
-        switch(NumElts) {
-        case  8: return MVT::v8i1;
-        case 16: return MVT::v16i1;
-        case 32: return MVT::v32i1;
-      }
+    if (VVT.is256BitVector() || VVT.is128BitVector()) {
+      if (Subtarget->hasVLX())
+        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+            EltVT == MVT::f32 || EltVT == MVT::f64)
+          switch(NumElts) {
+          case 2: return MVT::v2i1;
+          case 4: return MVT::v4i1;
+          case 8: return MVT::v8i1;
+        }
+      if (Subtarget->hasBWI() && Subtarget->hasVLX())
+        if (EltVT == MVT::i8 || EltVT == MVT::i16)
+          switch(NumElts) {
+          case  8: return MVT::v8i1;
+          case 16: return MVT::v16i1;
+          case 32: return MVT::v32i1;
+        }
+    }
    }
  
    return VT.changeVectorElementTypeToInteger();
@@ -2127,14 +2094,14 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
    return true;
  }
  
-/// Android provides a fixed TLS slot for the SafeStack pointer.
-/// See the definition of TLS_SLOT_SAFESTACK in
-/// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
-bool X86TargetLowering::getSafeStackPointerLocation(unsigned &AddressSpace,
-                                                    unsigned &Offset) const {
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
    if (!Subtarget->isTargetAndroid())
-    return false;
+    return TargetLowering::getSafeStackPointerLocation(IRB);
  
+  // Android provides a fixed TLS slot for the SafeStack pointer. See the
+  // definition of TLS_SLOT_SAFESTACK in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  unsigned AddressSpace, Offset;
    if (Subtarget->is64Bit()) {
      // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
      Offset = 0x48;
@@ -2147,7 +2114,10 @@ bool X86TargetLowering::getSafeStackPointerLocation(unsigned &AddressSpace,
      Offset = 0x24;
      AddressSpace = 256;
    }
-  return true;
+
+  return ConstantExpr::getIntToPtr(
+      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
  }
  
  bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -2988,7 +2958,7 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
  
  /// Returns a vector_shuffle mask for an movs{s|d}, movd
  /// operation of specified width.
-static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
                         SDValue V2) {
    unsigned NumElems = VT.getVectorNumElements();
    SmallVector<int, 8> Mask;
@@ -3874,7 +3844,7 @@ static bool isTargetShuffle(unsigned Opcode) {
    }
  }
  
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
                                      SDValue V1, unsigned TargetMask,
                                      SelectionDAG &DAG) {
    switch(Opc) {
@@ -3889,7 +3859,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
    }
  }
  
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
                                      SDValue V1, SDValue V2, SelectionDAG &DAG) {
    switch(Opc) {
    default: llvm_unreachable("Unknown x86 shuffle node");
@@ -4307,14 +4277,14 @@ bool X86::isZeroNode(SDValue Elt) {
  // Build a vector of constants
  // Use an UNDEF node if MaskElt == -1.
  // Spilt 64-bit constants in the 32-bit mode.
-static SDValue getConstVector(ArrayRef<int> Values, EVT VT,
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
                                SelectionDAG &DAG,
                                SDLoc dl, bool IsMask = false) {
  
    SmallVector<SDValue, 32>  Ops;
    bool Split = false;
  
-  EVT ConstVecVT = VT;
+  MVT ConstVecVT = VT;
    unsigned NumElts = VT.getVectorNumElements();
    bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
    if (!In64BitMode && VT.getScalarType() == MVT::i64) {
@@ -4322,7 +4292,7 @@ static SDValue getConstVector(ArrayRef<int> Values, EVT VT,
      Split = true;
    }
  
-  EVT EltVT = ConstVecVT.getScalarType();
+  MVT EltVT = ConstVecVT.getVectorElementType();
    for (unsigned i = 0; i < NumElts; ++i) {
      bool IsUndef = Values[i] < 0 && IsMask;
      SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
@@ -4403,19 +4373,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  
    // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
    unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
  
    // This is the index of the first element of the vectorWidth-bit chunk
-  // we want.
-  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
-                               * ElemsPerChunk);
+  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
+  IdxVal &= ~(ElemsPerChunk - 1);
  
    // If the input is a buildvector just emit a smaller one.
    if (Vec.getOpcode() == ISD::BUILD_VECTOR)
      return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
-                                    ElemsPerChunk));
+                       makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
  
-  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
  }
  
@@ -4453,13 +4422,13 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
  
    // Insert the relevant vectorWidth bits.
    unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
  
    // This is the index of the first element of the vectorWidth-bit chunk
-  // we want.
-  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
-                               * ElemsPerChunk);
+  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
+  IdxVal &= ~(ElemsPerChunk - 1);
  
-  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
  }
  
@@ -6892,22 +6861,62 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
  /// This doesn't do any checks for the availability of instructions for blending
  /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
  /// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is in fact a blend.
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
  static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
-                                         SDValue V2, ArrayRef<int> Mask,
+                                         SDValue V2, ArrayRef<int> Original,
                                           const X86Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+  SmallVector<int, 8> Mask(Original.begin(), Original.end());
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  bool ForceV1Zero = false, ForceV2Zero = false;
+
+  // Attempt to generate the binary blend mask. If an input is zero then
+  // we can use any lane.
+  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
    unsigned BlendMask = 0;
    for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] >= Size) {
-      if (Mask[i] != i + Size)
-        return SDValue(); // Shuffled V2 input!
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+    if (M == i)
+      continue;
+    if (M == i + Size) {
        BlendMask |= 1u << i;
        continue;
      }
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return SDValue(); // Shuffled V1 input!
+    if (Zeroable[i]) {
+      if (V1IsZero) {
+        ForceV1Zero = true;
+        Mask[i] = i;
+        continue;
+      }
+      if (V2IsZero) {
+        ForceV2Zero = true;
+        BlendMask |= 1u << i;
+        Mask[i] = i + Size;
+        continue;
+      }
+    }
+    return SDValue(); // Shuffled input!
    }
+
+  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+  if (ForceV1Zero)
+    V1 = getZeroVector(VT, Subtarget, DAG, DL);
+  if (ForceV2Zero)
+    V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+  auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
+    unsigned ScaledMask = 0;
+    for (int i = 0; i != Size; ++i)
+      if (BlendMask & (1u << i))
+        for (int j = 0; j != Scale; ++j)
+          ScaledMask |= 1u << (i * Scale + j);
+    return ScaledMask;
+  };
+
    switch (VT.SimpleTy) {
    case MVT::v2f64:
    case MVT::v4f32:
@@ -6927,12 +6936,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
      if (Subtarget->hasAVX2()) {
        // Scale the blend by the number of 32-bit dwords per element.
        int Scale =  VT.getScalarSizeInBits() / 32;
-      BlendMask = 0;
-      for (int i = 0, Size = Mask.size(); i < Size; ++i)
-        if (Mask[i] >= Size)
-          for (int j = 0; j < Scale; ++j)
-            BlendMask |= 1u << (i * Scale + j);
-
+      BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
        MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
        V1 = DAG.getBitcast(BlendVT, V1);
        V2 = DAG.getBitcast(BlendVT, V2);
@@ -6945,12 +6949,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
      // For integer shuffles we need to expand the mask and cast the inputs to
      // v8i16s prior to blending.
      int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = 0;
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      if (Mask[i] >= Size)
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-
+    BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
      V1 = DAG.getBitcast(MVT::v8i16, V1);
      V2 = DAG.getBitcast(MVT::v8i16, V2);
      return DAG.getBitcast(VT,
@@ -6975,7 +6974,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
      // FALLTHROUGH
    case MVT::v16i8:
    case MVT::v32i8: {
-    assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+    assert((VT.is128BitVector() || Subtarget->hasAVX2()) &&
             "256-bit byte-blends require AVX2 support!");
  
      // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
@@ -7202,7 +7201,7 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
                          DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
    }
  
-  assert(VT.getSizeInBits() == 128 &&
+  assert(VT.is128BitVector() &&
           "Rotate-based lowering only supports 128-bit lowering!");
    assert(Mask.size() <= 16 &&
           "Can shuffle at most 16 bytes in a 128-bit vector!");
@@ -7334,7 +7333,7 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
      // Determine the extraction length from the part of the
      // lower half that isn't zeroable.
      int Len = HalfSize;
-    for (; Len >= 0; --Len)
+    for (; Len > 0; --Len)
        if (!Zeroable[Len - 1])
          break;
      assert(Len > 0 && "Zeroable shuffle mask");
@@ -7350,7 +7349,7 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
        M = M % Size;
  
        // All mask elements must be in the lower half.
-      if (M > HalfSize)
+      if (M >= HalfSize)
          return SDValue();
  
        if (Idx < 0 || (Src == V && Idx == (M - i))) {
@@ -7490,7 +7489,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    if (Subtarget->hasSSE41()) {
      // Not worth offseting 128-bit vectors if scale == 2, a pattern using
      // PUNPCK will catch this in a later shuffle match.
-    if (Offset && Scale == 2 && VT.getSizeInBits() == 128)
+    if (Offset && Scale == 2 && VT.is128BitVector())
        return SDValue();
      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                   NumElements / Scale);
@@ -7498,7 +7497,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
      return DAG.getBitcast(VT, InputV);
    }
  
-  assert(VT.getSizeInBits() == 128 && "Only 128-bit vectors can be extended.");
+  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
  
    // For any extends we can cheat for larger element sizes and use shuffle
    // instructions that can fold with a load and/or copy.
@@ -7528,7 +7527,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    // to 64-bits.
    if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
      assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
-    assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+    assert(VT.is128BitVector() && "Unexpected vector width!");
  
      int LoIdx = Offset * EltBits;
      SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
@@ -9917,7 +9916,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
                                                         ArrayRef<int> Mask,
                                                         SelectionDAG &DAG) {
    // FIXME: This should probably be generalized for 512-bit vectors as well.
-  assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
+  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
    int LaneSize = Mask.size() / 2;
  
    // If there are only inputs from one 128-bit lane, splitting will in fact be
@@ -10732,16 +10731,15 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
  /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
  static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
-                                             ArrayRef<int> Mask,
-                                             SDValue V1, SDValue V2,
-                                             SelectionDAG &DAG) {
+                                        ArrayRef<int> Mask,
+                                        SDValue V1, SDValue V2,
+                                        SelectionDAG &DAG) {
    assert(VT.getScalarSizeInBits() == 64 &&
           "Unexpected element type size for 128bit shuffle.");
  
    // To handle 256 bit vector requires VLX and most probably
    // function lowerV2X128VectorShuffle() is better solution.
-  assert(VT.getSizeInBits() == 512 &&
-         "Unexpected vector size for 128bit shuffle.");
+  assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle.");
  
    SmallVector<int, 4> WidenedMask;
    if (!canWidenShuffleElements(Mask, WidenedMask))
@@ -10806,8 +10804,8 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
  /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
  static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
-                                       const X86Subtarget *Subtarget,
-                                       SelectionDAG &DAG) {
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
    SDLoc DL(Op);
    assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
    assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
@@ -10846,8 +10844,8 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
  /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
  static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
-                                       const X86Subtarget *Subtarget,
-                                       SelectionDAG &DAG) {
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
    SDLoc DL(Op);
    assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
    assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
@@ -10954,11 +10952,10 @@ static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    ArrayRef<int> Mask = SVOp->getMask();
    assert(Subtarget->hasAVX512() &&
           "Cannot lower 512-bit vectors w/o basic ISA!");
-  EVT ExtVT;
+  MVT ExtVT;
    switch (VT.SimpleTy) {
    default:
-    assert(false && "Expected a vector of i1 elements");
-    break;
+    llvm_unreachable("Expected a vector of i1 elements");
    case MVT::v2i1:
      ExtVT = MVT::v2i64;
      break;
@@ -11122,13 +11119,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    }
  
    // For each vector width, delegate to a specialized lowering routine.
-  if (VT.getSizeInBits() == 128)
+  if (VT.is128BitVector())
      return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
  
-  if (VT.getSizeInBits() == 256)
+  if (VT.is256BitVector())
      return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
  
-  if (VT.getSizeInBits() == 512)
+  if (VT.is512BitVector())
      return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
  
    if (Is1BitVector)
@@ -11392,10 +11389,11 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
      MVT EltVT = VecVT.getVectorElementType();
  
      unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
  
-    //if (IdxVal >= NumElems/2)
-    //  IdxVal -= NumElems/2;
-    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
+    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
+    // this can be done with a mask.
+    IdxVal &= ElemsPerChunk - 1;
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                         DAG.getConstant(IdxVal, dl, MVT::i32));
    }
@@ -11531,7 +11529,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
  
      // Insert the element into the desired chunk.
      unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
-    unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
+    assert(isPowerOf2_32(NumEltsIn128));
+    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
+    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
  
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                      DAG.getConstant(IdxIn128, dl, MVT::i32));
@@ -14436,7 +14436,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
    // Since SSE has no unsigned integer comparisons, we need to flip the sign
    // bits of the inputs before performing those operations.
    if (FlipSigns) {
-    EVT EltVT = VT.getVectorElementType();
+    MVT EltVT = VT.getVectorElementType();
      SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
                                   VT);
      Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
@@ -15129,7 +15129,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
    // memory. In practice, we ''widen'' MemVT.
    EVT WideVecVT =
        EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                       loadRegZize / MemVT.getScalarType().getSizeInBits());
+                       loadRegZize / MemVT.getScalarSizeInBits());
  
    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
           "Invalid vector type");
@@ -15923,22 +15923,22 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
      }
  
      switch (Op.getOpcode()) {
-      default: break;
-      case X86ISD::PCMPEQM:
-      case X86ISD::PCMPGTM:
-      case X86ISD::CMPM:
-      case X86ISD::CMPMU:
-        return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
-      case X86ISD::VFPCLASS:
-        return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
-      case X86ISD::VTRUNC:
-      case X86ISD::VTRUNCS:
-      case X86ISD::VTRUNCUS:
-        // We can't use ISD::VSELECT here because it is not always "Legal"
-        // for the destination type. For example vpmovqb require only AVX512
-        // and vselect that can operate on byte element type require BWI
-        OpcodeSelect = X86ISD::SELECT;
-        break;
+    default: break;
+    case X86ISD::PCMPEQM:
+    case X86ISD::PCMPGTM:
+    case X86ISD::CMPM:
+    case X86ISD::CMPMU:
+      return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+    case X86ISD::VFPCLASS:
+      return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+    case X86ISD::VTRUNC:
+    case X86ISD::VTRUNCS:
+    case X86ISD::VTRUNCUS:
+      // We can't use ISD::VSELECT here because it is not always "Legal"
+      // for the destination type. For example vpmovqb require only AVX512
+      // and vselect that can operate on byte element type require BWI
+      OpcodeSelect = X86ISD::SELECT;
+      break;
      }
      if (PreservedSrc.getOpcode() == ISD::UNDEF)
        PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
@@ -16665,7 +16665,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
      llvm_unreachable("Valid scale values are 1, 2, 4, 8");
  
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
-  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                               Index.getSimpleValueType().getVectorNumElements());
    SDValue MaskInReg;
    ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
@@ -16706,7 +16706,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
    SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
-  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                               Index.getSimpleValueType().getVectorNumElements());
    SDValue MaskInReg;
    ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
@@ -16737,7 +16737,7 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
    SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
-  EVT MaskVT =
+  MVT MaskVT =
      MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
    SDValue MaskInReg;
    ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
@@ -18237,7 +18237,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
  
    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
      SDValue BaseShAmt;
-    EVT EltVT = VT.getVectorElementType();
+    MVT EltVT = VT.getVectorElementType();
  
      if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
        // Check if this build_vector node is doing a splat.
@@ -18373,7 +18373,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      SmallVector<SDValue, 8> Elts;
      EVT SVT = VT.getScalarType();
      unsigned SVTBits = SVT.getSizeInBits();
-    const APInt &One = APInt(SVTBits, 1);
+    APInt One(SVTBits, 1);
      unsigned NumElems = VT.getVectorNumElements();
  
      for (unsigned i=0; i !=NumElems; ++i) {
@@ -18384,7 +18384,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
        }
  
        ConstantSDNode *ND = cast<ConstantSDNode>(Op);
-      const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
        uint64_t ShAmt = C.getZExtValue();
        if (ShAmt >= SVTBits) {
          Elts.push_back(DAG.getUNDEF(SVT));
@@ -18463,7 +18463,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
          isa<ConstantSDNode>(Amt2)) {
        // Replace this node with two shifts followed by a MOVSS/MOVSD.
-      EVT CastVT = MVT::v4i32;
+      MVT CastVT = MVT::v4i32;
        SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
        SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
@@ -18731,7 +18731,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
    if (VT.is256BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
      MVT EltVT = VT.getVectorElementType();
-    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+    MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  
      // Extract the two vectors
      SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
@@ -18779,10 +18779,10 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
    // +ve/-ve Amt = rotate left/right.
  
    // Split 256-bit integers.
-  if (VT.getSizeInBits() == 256)
+  if (VT.is256BitVector())
      return Lower256IntArith(Op, DAG);
  
-  assert(VT.getSizeInBits() == 128 && "Only rotate 128-bit vectors!");
+  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
  
    // Attempt to rotate by immediate.
    if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
@@ -19100,7 +19100,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
      SDValue InVec = Op->getOperand(0);
      SDLoc dl(Op);
      unsigned NumElts = SrcVT.getVectorNumElements();
-    EVT SVT = SrcVT.getVectorElementType();
+    MVT SVT = SrcVT.getVectorElementType();
  
      // Widen the vector in input in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
@@ -20324,11 +20324,11 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
      return false;
  
    // Not for i1 vectors
-  if (VT.getScalarType() == MVT::i1)
+  if (VT.getSimpleVT().getScalarType() == MVT::i1)
      return false;
  
    // Very little shuffling can be done for 64-bit vectors right now.
-  if (VT.getSizeInBits() == 64)
+  if (VT.getSimpleVT().getSizeInBits() == 64)
      return false;
  
    // We only care that the types being shuffled are legal. The lowering can
@@ -22032,7 +22032,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    unsigned Depth) const {
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    if (Op.getOpcode() == X86ISD::SETCC_CARRY)
-    return Op.getValueType().getScalarType().getSizeInBits();
+    return Op.getValueType().getScalarSizeInBits();
  
    // Fallback case.
    return 1;
@@ -22236,7 +22236,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
    // doesn't preclude something switching to the shorter encoding post-RA.
    //
    // FIXME: Should teach these routines about AVX vector widths.
-  if (FloatDomain && VT.getSizeInBits() == 128) {
+  if (FloatDomain && VT.is128BitVector()) {
      if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
        bool Lo = Mask.equals({0, 0});
        unsigned Shuffle;
@@ -22300,7 +22300,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
    // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
    // variants as none of these have single-instruction variants that are
    // superior to the UNPCK formulation.
-  if (!FloatDomain && VT.getSizeInBits() == 128 &&
+  if (!FloatDomain && VT.is128BitVector() &&
        (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
         Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
         Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
@@ -23120,21 +23120,45 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                       EltNo);
  }
  
-/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-/// special and don't usually play with other vector types, it's better to
-/// handle them early to be sure we emit efficient code by avoiding
-/// store-load conversions.
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
-  if (N->getValueType(0) != MVT::x86mmx ||
-      N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
-      N->getOperand(0)->getValueType(0) != MVT::v2i32)
-    return SDValue();
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0); // value being bitcast
+  EVT VT = N->getValueType(0); // type being bitcast to
  
-  SDValue V = N->getOperand(0);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
-  if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
-    return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
-                       N->getValueType(0), V.getOperand(0));
+  // Combine 1 of 2: a bitcast to x86mmx of a v2i32 BUILD_VECTOR whose second
+  // operand is the constant 0 and whose first operand is an i32 becomes an
+  // X86ISD::MMX_MOVW2D of that i32. MMX types don't usually play with other
+  // vector types, so handling this early avoids store-load conversions.
+  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
+      N0.getValueType() == MVT::v2i32 &&
+      isa<ConstantSDNode>(N0.getOperand(1))) {
+    SDValue N00 = N0->getOperand(0);
+    if (N0.getConstantOperandVal(1) == 0 && N00.getValueType() == MVT::i32)
+      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+  }
+
+  // Combine 2 of 2: convert a bitcasted integer logic operation (AND/OR/XOR)
+  // that has one bitcasted floating-point operand and one constant operand
+  // into the matching floating-point logic operation (FAND/FOR/FXOR). This
+  // may create a load of the constant, but that is cheaper than materializing
+  // the constant in an integer register and transferring it to an SSE
+  // register, or transferring the SSE operand to an integer register and back.
+  unsigned FPOpcode;
+  switch (N0.getOpcode()) {
+    case ISD::AND: FPOpcode = X86ISD::FAND; break;
+    case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
+    case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+    default: return SDValue(); // not an integer logic op: nothing to combine
+  }
+  if (((Subtarget->hasSSE1() && VT == MVT::f32) || // f32 requires SSE1
+       (Subtarget->hasSSE2() && VT == MVT::f64)) && // f64 requires SSE2
+      isa<ConstantSDNode>(N0.getOperand(1)) &&
+      N0.getOperand(0).getOpcode() == ISD::BITCAST &&
+      N0.getOperand(0).getOperand(0).getValueType() == VT) { // source is VT
+    SDValue N000 = N0.getOperand(0).getOperand(0); // the original FP value
+    SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); // constant as VT
+    return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
+  }
  
    return SDValue();
  }
@@ -23773,7 +23797,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
        !DCI.isBeforeLegalize() &&
        !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
-    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+    unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
  
      // Don't optimize vector selects that map to mask-registers.
      if (BitWidth == 1)
@@ -23797,11 +23821,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      if (VT.getScalarType() == MVT::i16)
        return SDValue();
      // Dynamic blending was only available from SSE4.1 onward.
-    if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+    if (VT.is128BitVector() && !Subtarget->hasSSE41())
        return SDValue();
      // Byte blends are only available in AVX2
-    if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
-        !Subtarget->hasAVX2())
+    if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
        return SDValue();
  
      assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
@@ -24384,7 +24407,8 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
    if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
      if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
        APInt ShiftAmt = AmtSplat->getAPIntValue();
-      unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
+      unsigned MaxAmount =
+        VT.getSimpleVT().getVectorElementType().getSizeInBits();
  
        // SSE2/AVX2 logical shifts always return a vector of 0s
        // if the shift amount is bigger than or equal to
@@ -24615,9 +24639,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
    case ISD::ANY_EXTEND:
      return Op;
    case ISD::ZERO_EXTEND: {
-    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
+    unsigned InBits = NarrowVT.getScalarSizeInBits();
      APInt Mask = APInt::getAllOnesValue(InBits);
-    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
+    Mask = Mask.zext(VT.getScalarSizeInBits());
      return DAG.getNode(ISD::AND, DL, VT,
                         Op, DAG.getConstant(Mask, DL, VT));
    }
@@ -26001,7 +26025,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
-  if (Subtarget->hasAVX() && VT.isVector() && VT.getSizeInBits() == 256)
+  if (Subtarget->hasAVX() && VT.is256BitVector())
      if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
        return R;
  
@@ -26632,7 +26656,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::SELECT:
    case X86ISD::SHRUNKBLEND:
      return PerformSELECTCombine(N, DAG, DCI, Subtarget);
-  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
+  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG, Subtarget);
    case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
    case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
    case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
@@ -27492,3 +27516,27 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
                                     Attribute::MinSize);
    return OptSize && !VT.isVector();
  }
+
+void X86TargetLowering::markInRegArguments(SelectionDAG &DAG,
+       TargetLowering::ArgListTy& Args) const {
+  // The MCU psABI requires some arguments to be passed in-register.
+  // For regular calls, the inreg arguments are marked by the front-end.
+  // However, for compiler-generated library calls, we have to patch this
+  // up here by setting isInReg on the arguments that fit the budget below.
+  if (!Subtarget->isTargetMCU() || !Args.size()) // non-MCU target or no args
+    return;
+
+  unsigned FreeRegs = 3; // budget of 32-bit units not yet handed out
+  for (auto &Arg : Args) {
+    // Library calls are not expected to use fancy types; only size is used.
+    unsigned Size = DAG.getDataLayout().getTypeSizeInBits(Arg.Ty);
+    unsigned SizeInRegs = (Size + 31) / 32; // bits rounded up to 32-bit units
+    if (SizeInRegs > 2 || SizeInRegs > FreeRegs) // too wide, or no room left
+      continue; // left unmarked; a later, smaller argument may still fit
+
+    Arg.isInReg = true;
+    FreeRegs -= SizeInRegs;
+    if (!FreeRegs) // budget exhausted: no further argument can be marked
+      break;
+  }
+}