[SystemZ] Fix expansion of ISD::FPOW and ISD::FSINCOS

[oota-llvm.git] / lib / Target / SystemZ / SystemZISelLowering.cpp
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp

index 5f547439c9aab351ffcce3b3a1d2ae82d999b341..9ef7ba248107d4a70af4d43e172eeaebe505ce92 100644 (file)
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -81,19 +81,24 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
    return Op;
  }
  
-SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
                                               const SystemZSubtarget &STI)
-    : TargetLowering(tm), Subtarget(STI) {
-  MVT PtrVT = getPointerTy();
+    : TargetLowering(TM), Subtarget(STI) {
+  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
  
    // Set up the register classes.
    if (Subtarget.hasHighWord())
      addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
    else
      addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
-  addRegisterClass(MVT::i64,  &SystemZ::GR64BitRegClass);
-  addRegisterClass(MVT::f32,  &SystemZ::FP32BitRegClass);
-  addRegisterClass(MVT::f64,  &SystemZ::FP64BitRegClass);
+  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
+  if (Subtarget.hasVector()) {
+    addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
+  } else {
+    addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+  }
    addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
  
    if (Subtarget.hasVector()) {
@@ -101,6 +106,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
      addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
      addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
      addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
      addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
    }
  
@@ -275,7 +281,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
      if (isTypeLegal(VT)) {
        // These operations are legal for anything that can be stored in a
        // vector register, even if there is no native support for the format
-      // as such.
+      // as such.  In particular, we can do these for v4f32 even though there
+      // are no specific instructions for that format.
        setOperationAction(ISD::LOAD, VT, Legal);
        setOperationAction(ISD::STORE, VT, Legal);
        setOperationAction(ISD::VSELECT, VT, Legal);
@@ -311,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
        // Convert a GPR scalar to a vector by inserting it into element 0.
        setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
  
+      // Use a series of unpacks for extensions.
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+
        // Detect shifts by a scalar amount and convert them into
        // V*_BY_SCALAR.
        setOperationAction(ISD::SHL, VT, Custom);
@@ -358,18 +369,23 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
        // No special instructions for these.
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FSINCOS, VT, Expand);
        setOperationAction(ISD::FREM, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
      }
    }
  
    // Handle floating-point vector types.
    if (Subtarget.hasVector()) {
      // Scalar-to-vector conversion is just a subreg.
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
  
      // Some insertions and extractions can be done directly but others
      // need to go via integers.
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
  
      // These operations have direct equivalents.
@@ -407,8 +423,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
  
    // We have 64-bit FPR<->GPR moves, but need special handling for
    // 32-bit forms.
-  setOperationAction(ISD::BITCAST, MVT::i32, Custom);
-  setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+  if (!Subtarget.hasVector()) {
+    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+  }
  
    // VASTART and VACOPY need to deal with the SystemZ-specific varargs
    // structure, but VAEND is a no-op.
@@ -420,9 +438,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::FP_ROUND);
  
    // Handle intrinsics.
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  
    // We want to use MVC in preference to even a single load/store pair.
    MaxStoresPerMemcpy = 0;
@@ -437,7 +457,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
    MaxStoresPerMemsetOptSize = 0;
  }
  
-EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
+                                              LLVMContext &, EVT VT) const {
    if (!VT.isVector())
      return MVT::i32;
    return VT.changeVectorElementTypeToInteger();
@@ -488,9 +509,10 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
      *Fast = true;
    return true;
  }
-  
-bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                                  Type *Ty) const {
+
+bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                  const AddrMode &AM, Type *Ty,
+                                                  unsigned AS) const {
    // Punt on globals for now, although they can be used in limited
    // RELATIVE LONG cases.
    if (AM.BaseGV)
@@ -525,7 +547,7 @@ bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
  //===----------------------------------------------------------------------===//
  
  TargetLowering::ConstraintType
-SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
+SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
    if (Constraint.size() == 1) {
      switch (Constraint[0]) {
      case 'a': // Address register
@@ -622,13 +644,14 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
  // has already been verified.  MC is the class associated with "t" and
  // Map maps 0-based register numbers to LLVM register numbers.
  static std::pair<unsigned, const TargetRegisterClass *>
-parseRegisterNumber(const std::string &Constraint,
-                    const TargetRegisterClass *RC, const unsigned *Map) {
+parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
+                    const unsigned *Map) {
    assert(*(Constraint.end()-1) == '}' && "Missing '}'");
    if (isdigit(Constraint[2])) {
-    std::string Suffix(Constraint.data() + 2, Constraint.size() - 2);
-    unsigned Index = atoi(Suffix.c_str());
-    if (Index < 16 && Map[Index])
+    unsigned Index;
+    bool Failed =
+        Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
+    if (!Failed && Index < 16 && Map[Index])
        return std::make_pair(Map[Index], RC);
    }
    return std::make_pair(0U, nullptr);
@@ -636,8 +659,7 @@ parseRegisterNumber(const std::string &Constraint,
  
  std::pair<unsigned, const TargetRegisterClass *>
  SystemZTargetLowering::getRegForInlineAsmConstraint(
-    const TargetRegisterInfo *TRI, const std::string &Constraint,
-    MVT VT) const {
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
    if (Constraint.size() == 1) {
      // GCC Constraint Letters
      switch (Constraint[0]) {
@@ -668,7 +690,7 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
        return std::make_pair(0U, &SystemZ::FP32BitRegClass);
      }
    }
-  if (Constraint[0] == '{') {
+  if (Constraint.size() > 0 && Constraint[0] == '{') {
      // We need to override the default register parsing for GPRs and FPRs
      // because the interpretation depends on VT.  The internal names of
      // the registers are also different from the external names
@@ -760,6 +782,24 @@ bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
    return true;
  }
  
+// We do not yet support 128-bit single-element vector types.  If the user
+// attempts to use such types as function argument or return type, prefer
+// to error out instead of emitting code violating the ABI.
+static void VerifyVectorType(MVT VT, EVT ArgVT) {
+  if (ArgVT.isVector() && !VT.isVector())
+    report_fatal_error("Unsupported vector argument or return type");
+}
+
+static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
+  for (unsigned i = 0; i < Ins.size(); ++i)
+    VerifyVectorType(Ins[i].VT, Ins[i].ArgVT);
+}
+
+static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  for (unsigned i = 0; i < Outs.size(); ++i)
+    VerifyVectorType(Outs[i].VT, Outs[i].ArgVT);
+}
+
  // Value is a value that has been passed to us in the location described by VA
  // (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
  // any loads onto Chain.
@@ -780,7 +820,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
    else if (VA.getLocInfo() == CCValAssign::Indirect)
      Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
                          MachinePointerInfo(), false, false, false, 0);
-  else
+  else if (VA.getLocInfo() == CCValAssign::BCvt) {
+    // If this is a short vector argument loaded from the stack,
+    // extend from i64 to full vector size and then bitcast.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64,
+                        Value, DAG.getUNDEF(MVT::i64));
+    Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
+  } else
      assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
    return Value;
  }
@@ -797,6 +845,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
      return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
    case CCValAssign::AExt:
      return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::BCvt:
+    // If this is a short vector argument to be stored to the stack,
+    // bitcast to v2i64 and then extract first element.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
+                       DAG.getConstant(0, DL, MVT::i32));
    case CCValAssign::Full:
      return Value;
    default:
@@ -817,6 +873,10 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    auto *TFL =
        static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
  
+  // Detect unsupported vector argument types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Ins);
+
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
    SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
@@ -855,6 +915,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
        case MVT::v8i16:
        case MVT::v4i32:
        case MVT::v2i64:
+      case MVT::v4f32:
        case MVT::v2f64:
          RC = &SystemZ::VR128BitRegClass;
          break;
@@ -873,14 +934,14 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
        // Create the SelectionDAG nodes corresponding to a load
        // from this parameter.  Unpromoted ints and floats are
        // passed as right-justified 8-byte values.
-      EVT PtrVT = getPointerTy();
+      EVT PtrVT = getPointerTy(DAG.getDataLayout());
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
          FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                            DAG.getIntPtrConstant(4, DL));
        ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
-                             MachinePointerInfo::getFixedStack(FI),
-                             false, false, false, 0);
+                             MachinePointerInfo::getFixedStack(MF, FI), false,
+                             false, false, 0);
      }
  
      // Convert the value of the argument register into the value that's
@@ -911,14 +972,13 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
        for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
          unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
          int FI = MFI->CreateFixedObject(8, RegSaveOffset + Offset, true);
-        SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+        SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
          unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
                                       &SystemZ::FP64BitRegClass);
          SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
          MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
-                                 MachinePointerInfo::getFixedStack(FI),
+                                 MachinePointerInfo::getFixedStack(MF, FI),
                                   false, false, 0);
-
        }
        // Join the stores, which are independent of one another.
        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
@@ -961,7 +1021,13 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
    CallingConv::ID CallConv = CLI.CallConv;
    bool IsVarArg = CLI.IsVarArg;
    MachineFunction &MF = DAG.getMachineFunction();
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Detect unsupported vector argument and return types.
+  if (Subtarget.hasVector()) {
+    VerifyVectorTypes(Outs);
+    VerifyVectorTypes(Ins);
+  }
  
    // Analyze the operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
@@ -994,9 +1060,9 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
        // Store the argument in a stack slot and pass its address.
        SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
-      MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, SpillSlot,
-                                         MachinePointerInfo::getFixedStack(FI),
-                                         false, false, 0));
+      MemOpChains.push_back(DAG.getStore(
+          Chain, DL, ArgValue, SpillSlot,
+          MachinePointerInfo::getFixedStack(MF, FI), false, false, 0));
        ArgValue = SpillSlot;
      } else
        ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
@@ -1109,6 +1175,20 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
    return Chain;
  }
  
+bool SystemZTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv,
+               MachineFunction &MF, bool isVarArg,
+               const SmallVectorImpl<ISD::OutputArg> &Outs,
+               LLVMContext &Context) const {
+  // Detect unsupported vector return types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Outs);
+
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
+  return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
+}
+
  SDValue
  SystemZTargetLowering::LowerReturn(SDValue Chain,
                                     CallingConv::ID CallConv, bool IsVarArg,
@@ -1117,6 +1197,10 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
                                     SDLoc DL, SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
  
+  // Detect unsupported vector return types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Outs);
+
    // Assign locations to each returned value.
    SmallVector<CCValAssign, 16> RetLocs;
    CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
@@ -1187,6 +1271,143 @@ static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
    }
  }
  
+// Return true if Op is an intrinsic node without chain that returns the
+// CC value as its final argument.  Provide the associated SystemZISD
+// opcode and the mask of valid CC values if so.
+static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
+  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (Id) {
+  case Intrinsic::s390_vpkshs:
+  case Intrinsic::s390_vpksfs:
+  case Intrinsic::s390_vpksgs:
+    Opcode = SystemZISD::PACKS_CC;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vpklshs:
+  case Intrinsic::s390_vpklsfs:
+  case Intrinsic::s390_vpklsgs:
+    Opcode = SystemZISD::PACKLS_CC;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vceqbs:
+  case Intrinsic::s390_vceqhs:
+  case Intrinsic::s390_vceqfs:
+  case Intrinsic::s390_vceqgs:
+    Opcode = SystemZISD::VICMPES;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vchbs:
+  case Intrinsic::s390_vchhs:
+  case Intrinsic::s390_vchfs:
+  case Intrinsic::s390_vchgs:
+    Opcode = SystemZISD::VICMPHS;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vchlbs:
+  case Intrinsic::s390_vchlhs:
+  case Intrinsic::s390_vchlfs:
+  case Intrinsic::s390_vchlgs:
+    Opcode = SystemZISD::VICMPHLS;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vtm:
+    Opcode = SystemZISD::VTM;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vfaebs:
+  case Intrinsic::s390_vfaehs:
+  case Intrinsic::s390_vfaefs:
+    Opcode = SystemZISD::VFAE_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfaezbs:
+  case Intrinsic::s390_vfaezhs:
+  case Intrinsic::s390_vfaezfs:
+    Opcode = SystemZISD::VFAEZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfeebs:
+  case Intrinsic::s390_vfeehs:
+  case Intrinsic::s390_vfeefs:
+    Opcode = SystemZISD::VFEE_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfeezbs:
+  case Intrinsic::s390_vfeezhs:
+  case Intrinsic::s390_vfeezfs:
+    Opcode = SystemZISD::VFEEZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfenebs:
+  case Intrinsic::s390_vfenehs:
+  case Intrinsic::s390_vfenefs:
+    Opcode = SystemZISD::VFENE_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfenezbs:
+  case Intrinsic::s390_vfenezhs:
+  case Intrinsic::s390_vfenezfs:
+    Opcode = SystemZISD::VFENEZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vistrbs:
+  case Intrinsic::s390_vistrhs:
+  case Intrinsic::s390_vistrfs:
+    Opcode = SystemZISD::VISTR_CC;
+    CCValid = SystemZ::CCMASK_0 | SystemZ::CCMASK_3;
+    return true;
+
+  case Intrinsic::s390_vstrcbs:
+  case Intrinsic::s390_vstrchs:
+  case Intrinsic::s390_vstrcfs:
+    Opcode = SystemZISD::VSTRC_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vstrczbs:
+  case Intrinsic::s390_vstrczhs:
+  case Intrinsic::s390_vstrczfs:
+    Opcode = SystemZISD::VSTRCZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfcedbs:
+    Opcode = SystemZISD::VFCMPES;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vfchdbs:
+    Opcode = SystemZISD::VFCMPHS;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vfchedbs:
+    Opcode = SystemZISD::VFCMPHES;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vftcidb:
+    Opcode = SystemZISD::VFTCI;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  default:
+    return false;
+  }
+}
+
  // Emit an intrinsic with chain with a glued value instead of its CC result.
  static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
                                               unsigned Opcode) {
@@ -1207,6 +1428,23 @@ static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
    return Intr;
  }
  
+// Emit an intrinsic with a glued value instead of its CC result.
+static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op,
+                                     unsigned Opcode) {
+  // Copy all operands except the intrinsic ID.
+  unsigned NumOps = Op.getNumOperands();
+  SmallVector<SDValue, 6> Ops;
+  Ops.reserve(NumOps - 1);
+  for (unsigned I = 1; I < NumOps; ++I)
+    Ops.push_back(Op.getOperand(I));
+
+  if (Op->getNumValues() == 1)
+    return DAG.getNode(Opcode, SDLoc(Op), MVT::Glue, Ops);
+  assert(Op->getNumValues() == 2 && "Expected exactly one non-CC result");
+  SDVTList RawVTs = DAG.getVTList(Op->getValueType(0), MVT::Glue);
+  return DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
+}
+
  // CC is a comparison that will be implemented using an integer or
  // floating-point comparison.  Return the condition code mask for
  // a branch on true.  In the integer case, CCMASK_CMP_UO is set for
@@ -1783,17 +2021,17 @@ static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
    else if (Cond == ISD::SETLT || Cond == ISD::SETULT)
      // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3,
      // always true for CC>3.
-    C.CCMask = CC < 4 ? -1 << (4 - CC) : -1;
+    C.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
    else if (Cond == ISD::SETGE || Cond == ISD::SETUGE)
      // ...and the inverse of that.
-    C.CCMask = CC < 4 ? ~(-1 << (4 - CC)) : 0;
+    C.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
    else if (Cond == ISD::SETLE || Cond == ISD::SETULE)
      // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true),
      // always true for CC>3.
-    C.CCMask = CC < 4 ? -1 << (3 - CC) : -1;
+    C.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
    else if (Cond == ISD::SETGT || Cond == ISD::SETUGT)
      // ...and the inverse of that.
-    C.CCMask = CC < 4 ? ~(-1 << (3 - CC)) : 0;
+    C.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
    else
      llvm_unreachable("Unexpected integer comparison type");
    C.CCMask &= CCValid;
@@ -1810,6 +2048,10 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
          CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
          isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
        return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
+    if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+        CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
+        isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
+      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
    }
    Comparison C(CmpOp0, CmpOp1);
    C.CCMask = CCMaskForCondCode(Cond);
@@ -1858,6 +2100,9 @@ static SDValue emitCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
      case ISD::INTRINSIC_W_CHAIN:
        Op = emitIntrinsicWithChainAndGlue(DAG, C.Op0, C.Opcode);
        break;
+    case ISD::INTRINSIC_WO_CHAIN:
+      Op = emitIntrinsicWithGlue(DAG, C.Op0, C.Opcode);
+      break;
      default:
        llvm_unreachable("Invalid comparison operands");
      }
@@ -1943,14 +2188,14 @@ static unsigned getVectorComparison(ISD::CondCode CC, bool IsFP) {
  
    case ISD::SETOGE:
    case ISD::SETGE:
-    return IsFP ? SystemZISD::VFCMPHE : 0;
+    return IsFP ? SystemZISD::VFCMPHE : static_cast<SystemZISD::NodeType>(0);
  
    case ISD::SETOGT:
    case ISD::SETGT:
      return IsFP ? SystemZISD::VFCMPH : SystemZISD::VICMPH;
  
    case ISD::SETUGT:
-    return IsFP ? 0 : SystemZISD::VICMPHL;
+    return IsFP ? static_cast<SystemZISD::NodeType>(0) : SystemZISD::VICMPHL;
  
    default:
      return 0;
@@ -1977,6 +2222,33 @@ static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
    return 0;
  }
  
+// Return a v2f64 that contains the extended form of elements Start and Start+1
+// of v4f32 value Op.
+static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, SDLoc DL,
+                                  SDValue Op) {
+  int Mask[] = { Start, -1, Start + 1, -1 };
+  Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
+  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op);
+}
+
+// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
+// producing a result of type VT.
+static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, SDLoc DL,
+                            EVT VT, SDValue CmpOp0, SDValue CmpOp1) {
+  // There is no hardware support for v4f32, so extend the vector into
+  // two v2f64s and compare those.
+  if (CmpOp0.getValueType() == MVT::v4f32) {
+    SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
+    SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
+    SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
+    SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
+    SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1);
+    SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
+    return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
+  }
+  return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+}
+
  // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
  // an integer mask of type VT.
  static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
@@ -1991,8 +2263,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
      Invert = true;
    case ISD::SETO: {
      assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GE = DAG.getNode(SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
      Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
      break;
    }
@@ -2002,8 +2274,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
      Invert = true;
    case ISD::SETONE: {
      assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
      Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
      break;
    }
@@ -2013,11 +2285,11 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
      // there are no cases where both work.
    default:
      if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-      Cmp = DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
      else {
        CC = ISD::getSetCCSwappedOperands(CC);
        if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-        Cmp = DAG.getNode(Opcode, DL, VT, CmpOp1, CmpOp0);
+        Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
        else
          llvm_unreachable("Unhandled comparison");
      }
@@ -2145,7 +2417,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
    SDLoc DL(Node);
    const GlobalValue *GV = Node->getGlobal();
    int64_t Offset = Node->getOffset();
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
    Reloc::Model RM = DAG.getTarget().getRelocationModel();
    CodeModel::Model CM = DAG.getTarget().getCodeModel();
  
@@ -2167,7 +2439,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
      Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
      Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
      Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
-                         MachinePointerInfo::getGOT(), false, false, false, 0);
+                         MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+                         false, false, false, 0);
    }
  
    // If there was a non-zero offset that we didn't fold, create an explicit
@@ -2184,7 +2457,7 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
                                                   unsigned Opcode,
                                                   SDValue GOTOffset) const {
    SDLoc DL(Node);
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue Chain = DAG.getEntryNode();
    SDValue Glue;
  
@@ -2228,9 +2501,11 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
  
  SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
                                                      SelectionDAG &DAG) const {
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(Node, DAG);
    SDLoc DL(Node);
    const GlobalValue *GV = Node->getGlobal();
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
  
    // The high part of the thread pointer is in access register 0.
@@ -2257,9 +2532,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
          SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
  
        Offset = DAG.getConstantPool(CPV, PtrVT, 8);
-      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-                           Offset, MachinePointerInfo::getConstantPool(),
-                           false, false, false, 0);
+      Offset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), Offset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+          false, false, 0);
  
        // Call __tls_get_offset to retrieve the offset.
        Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
@@ -2272,9 +2548,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
          SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
  
        Offset = DAG.getConstantPool(CPV, PtrVT, 8);
-      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-                           Offset, MachinePointerInfo::getConstantPool(),
-                           false, false, false, 0);
+      Offset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), Offset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+          false, false, 0);
  
        // Call __tls_get_offset to retrieve the module base offset.
        Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
@@ -2290,9 +2567,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
        CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
  
        SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
-      DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-                              DTPOffset, MachinePointerInfo::getConstantPool(),
-                              false, false, false, 0);
+      DTPOffset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), DTPOffset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+          false, false, 0);
  
        Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
        break;
@@ -2303,8 +2581,8 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
        Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                            SystemZII::MO_INDNTPOFF);
        Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
-      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-                           Offset, MachinePointerInfo::getGOT(),
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
+                           MachinePointerInfo::getGOT(DAG.getMachineFunction()),
                             false, false, false, 0);
        break;
      }
@@ -2315,9 +2593,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
          SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
  
        Offset = DAG.getConstantPool(CPV, PtrVT, 8);
-      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-                           Offset, MachinePointerInfo::getConstantPool(),
-                           false, false, false, 0);
+      Offset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), Offset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+          false, false, 0);
        break;
      }
    }
@@ -2331,7 +2610,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
    SDLoc DL(Node);
    const BlockAddress *BA = Node->getBlockAddress();
    int64_t Offset = Node->getOffset();
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  
    SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
@@ -2341,7 +2620,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
  SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
                                                SelectionDAG &DAG) const {
    SDLoc DL(JT);
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
  
    // Use LARL to load the address of the table.
@@ -2351,7 +2630,7 @@ SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
  SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
                                                   SelectionDAG &DAG) const {
    SDLoc DL(CP);
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  
    SDValue Result;
    if (CP->isMachineConstantPoolEntry())
@@ -2415,7 +2694,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
    MachineFunction &MF = DAG.getMachineFunction();
    SystemZMachineFunctionInfo *FuncInfo =
      MF.getInfo<SystemZMachineFunctionInfo>();
-  EVT PtrVT = getPointerTy();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  
    SDValue Chain   = Op.getOperand(0);
    SDValue Addr    = Op.getOperand(1);
@@ -2965,6 +3244,67 @@ SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
    return SDValue();
  }
  
+SDValue
+SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  unsigned Opcode, CCValid;
+  if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
+    SDValue Glued = emitIntrinsicWithGlue(DAG, Op, Opcode);
+    SDValue CC = getCCResult(DAG, Glued.getNode());
+    if (Op->getNumValues() == 1)
+      return CC;
+    assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
+    return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
+                   Glued, CC);
+  }
+
+  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (Id) {
+  case Intrinsic::s390_vpdi:
+    return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::s390_vperm:
+    return DAG.getNode(SystemZISD::PERMUTE, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::s390_vuphb:
+  case Intrinsic::s390_vuphh:
+  case Intrinsic::s390_vuphf:
+    return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vuplhb:
+  case Intrinsic::s390_vuplhh:
+  case Intrinsic::s390_vuplhf:
+    return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vuplb:
+  case Intrinsic::s390_vuplhw:
+  case Intrinsic::s390_vuplf:
+    return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vupllb:
+  case Intrinsic::s390_vupllh:
+  case Intrinsic::s390_vupllf:
+    return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vsumb:
+  case Intrinsic::s390_vsumh:
+  case Intrinsic::s390_vsumgh:
+  case Intrinsic::s390_vsumgf:
+  case Intrinsic::s390_vsumqf:
+  case Intrinsic::s390_vsumqg:
+    return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  return SDValue();
+}
+
  namespace {
  // Says that SystemZISD operation Opcode can be used to perform the equivalent
  // of a VPERM with permute vector Bytes.  If Opcode takes three operands,
@@ -3480,7 +3820,7 @@ static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
        for (unsigned J = 0; J < BytesPerElement; ++J) {
          uint64_t Byte = (Value >> (J * 8)) & 0xff;
          if (Byte == 0xff)
-          Mask |= 1 << ((E - I - 1) * BytesPerElement + J);
+          Mask |= 1ULL << ((E - I - 1) * BytesPerElement + J);
          else if (Byte != 0)
            return false;
        }
@@ -3557,7 +3897,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
        GS.addUndef();
      } else {
        GS.add(SDValue(), ResidueOps.size());
-      ResidueOps.push_back(Op);
+      ResidueOps.push_back(BVN->getOperand(I));
      }
    }
  
@@ -3621,6 +3961,31 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
    if (VT == MVT::v2f64)
      return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
  
+  // Build v4f32 values directly from the FPRs:
+  //
+  //   <Axxx> <Bxxx> <Cxxxx> <Dxxx>
+  //         V              V         VMRHF
+  //      <ABxx>         <CDxx>
+  //                V                 VMRHG
+  //              <ABCD>
+  if (VT == MVT::v4f32) {
+    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
+    // Avoid unnecessary undefs by reusing the other operand.
+    if (Op01.getOpcode() == ISD::UNDEF)
+      Op01 = Op23;
+    else if (Op23.getOpcode() == ISD::UNDEF)
+      Op23 = Op01;
+    // Merging identical replications is a no-op.
+    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+      return Op01;
+    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
+    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
+    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+                             DL, MVT::v2i64, Op01, Op23);
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+
    // Collect the constant terms.
    SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
    SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
@@ -3796,10 +4161,11 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
    SDValue Op2 = Op.getOperand(2);
    EVT VT = Op.getValueType();
  
-  // Insertions into constant indices can be done using VPDI.  However,
-  // if the inserted value is a bitcast or a constant then it's better
-  // to use GPRs, as below.
-  if (Op1.getOpcode() != ISD::BITCAST &&
+  // Insertions into constant indices of a v2f64 can be done using VPDI.
+  // However, if the inserted value is a bitcast or a constant then it's
+  // better to use GPRs, as below.
+  if (VT == MVT::v2f64 &&
+      Op1.getOpcode() != ISD::BITCAST &&
        Op1.getOpcode() != ISD::ConstantFP &&
        Op2.getOpcode() == ISD::Constant) {
      uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
@@ -3843,6 +4209,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
    return DAG.getNode(ISD::BITCAST, DL, VT, Res);
  }
  
+SDValue
+SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
+                                             unsigned UnpackHigh) const {
+  SDValue PackedOp = Op.getOperand(0);
+  EVT OutVT = Op.getValueType();
+  EVT InVT = PackedOp.getValueType();
+  unsigned ToBits = OutVT.getVectorElementType().getSizeInBits();
+  unsigned FromBits = InVT.getVectorElementType().getSizeInBits();
+  do {
+    FromBits *= 2;
+    EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
+                                 SystemZ::VectorBits / FromBits);
+    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+  } while (FromBits != ToBits);
+  return PackedOp;
+}
+
  SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
                                            unsigned ByScalar) const {
    // Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -3981,6 +4364,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
      return lowerPREFETCH(Op, DAG);
    case ISD::INTRINSIC_W_CHAIN:
      return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return lowerINTRINSIC_WO_CHAIN(Op, DAG);
    case ISD::BUILD_VECTOR:
      return lowerBUILD_VECTOR(Op, DAG);
    case ISD::VECTOR_SHUFFLE:
@@ -3991,6 +4376,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
      return lowerINSERT_VECTOR_ELT(Op, DAG);
    case ISD::EXTRACT_VECTOR_ELT:
      return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
    case ISD::SHL:
      return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
    case ISD::SRL:
@@ -4004,7 +4393,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
  
  const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
  #define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
-  switch (Opcode) {
+  switch ((SystemZISD::NodeType)Opcode) {
+    case SystemZISD::FIRST_NUMBER: break;
      OPCODE(RET_FLAG);
      OPCODE(CALL);
      OPCODE(SIBCALL);
@@ -4055,6 +4445,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
      OPCODE(PERMUTE_DWORDS);
      OPCODE(PERMUTE);
      OPCODE(PACK);
+    OPCODE(PACKS_CC);
+    OPCODE(PACKLS_CC);
+    OPCODE(UNPACK_HIGH);
+    OPCODE(UNPACKL_HIGH);
+    OPCODE(UNPACK_LOW);
+    OPCODE(UNPACKL_LOW);
      OPCODE(VSHL_BY_SCALAR);
      OPCODE(VSRL_BY_SCALAR);
      OPCODE(VSRA_BY_SCALAR);
@@ -4062,9 +4458,28 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
      OPCODE(VICMPE);
      OPCODE(VICMPH);
      OPCODE(VICMPHL);
+    OPCODE(VICMPES);
+    OPCODE(VICMPHS);
+    OPCODE(VICMPHLS);
      OPCODE(VFCMPE);
      OPCODE(VFCMPH);
      OPCODE(VFCMPHE);
+    OPCODE(VFCMPES);
+    OPCODE(VFCMPHS);
+    OPCODE(VFCMPHES);
+    OPCODE(VFTCI);
+    OPCODE(VEXTEND);
+    OPCODE(VROUND);
+    OPCODE(VTM);
+    OPCODE(VFAE_CC);
+    OPCODE(VFAEZ_CC);
+    OPCODE(VFEE_CC);
+    OPCODE(VFEEZ_CC);
+    OPCODE(VFENE_CC);
+    OPCODE(VFENEZ_CC);
+    OPCODE(VISTR_CC);
+    OPCODE(VSTRC_CC);
+    OPCODE(VSTRCZ_CC);
      OPCODE(ATOMIC_SWAPW);
      OPCODE(ATOMIC_LOADW_ADD);
      OPCODE(ATOMIC_LOADW_SUB);
@@ -4265,6 +4680,37 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
        }
      }
    }
+  if (Opcode == SystemZISD::MERGE_HIGH ||
+      Opcode == SystemZISD::MERGE_LOW) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    if (Op0.getOpcode() == ISD::BITCAST)
+      Op0 = Op0.getOperand(0);
+    if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+        cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
+      // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
+      // for v4f32.
+      if (Op1 == N->getOperand(0))
+        return Op1;
+      // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
+      EVT VT = Op1.getValueType();
+      unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
+      if (ElemBytes <= 4) {
+        Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
+                  SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
+        EVT InVT = VT.changeVectorElementTypeToInteger();
+        EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
+                                     SystemZ::VectorBytes / ElemBytes / 2);
+        if (VT != InVT) {
+          Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
+          DCI.AddToWorklist(Op1.getNode());
+        }
+        SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
+        DCI.AddToWorklist(Op.getNode());
+        return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+      }
+    }
+  }
    // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
    // for the extraction to be done on a vMiN value, so that we can use VSTE.
    // If X has wider elements then convert it to:
@@ -4299,6 +4745,49 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
        N->getOperand(0) == N->getOperand(1))
      return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
                         N->getOperand(0));
+  // (fround (extract_vector_elt X 0))
+  // (fround (extract_vector_elt X 1)) ->
+  // (extract_vector_elt (VROUND X) 0)
+  // (extract_vector_elt (VROUND X) 1)
+  //
+  // This is a special case since the target doesn't really support v2f32s.
+  if (Opcode == ISD::FP_ROUND) {
+    SDValue Op0 = N->getOperand(0);
+    if (N->getValueType(0) == MVT::f32 &&
+        Op0.hasOneUse() &&
+        Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op0.getOperand(0).getValueType() == MVT::v2f64 &&
+        Op0.getOperand(1).getOpcode() == ISD::Constant &&
+        cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+      SDValue Vec = Op0.getOperand(0);
+      for (auto *U : Vec->uses()) {
+        if (U != Op0.getNode() &&
+            U->hasOneUse() &&
+            U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+            U->getOperand(0) == Vec &&
+            U->getOperand(1).getOpcode() == ISD::Constant &&
+            cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+          SDValue OtherRound = SDValue(*U->use_begin(), 0);
+          if (OtherRound.getOpcode() == ISD::FP_ROUND &&
+              OtherRound.getOperand(0) == SDValue(U, 0) &&
+              OtherRound.getValueType() == MVT::f32) {
+            SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
+                                         MVT::v4f32, Vec);
+            DCI.AddToWorklist(VRound.getNode());
+            SDValue Extract1 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
+                          VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
+            DCI.AddToWorklist(Extract1.getNode());
+            DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
+            SDValue Extract0 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
+                          VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+            return Extract0;
+          }
+        }
+      }
+    }
+  }
    return SDValue();
  }