[SystemZ] Use POPCNT instruction on z196

[oota-llvm.git] / lib / Target / SystemZ / SystemZISelLowering.cpp
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp

index 19da96a04b5d596b1359d92453597d7a9744ab68..e0cb376d11d254a30a81abd925b16695703cbe0d 100644 (file)
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -80,9 +80,9 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
    return Op;
  }
  
-SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
-  : TargetLowering(tm, new TargetLoweringObjectFileELF()),
-    Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
+                                             const SystemZSubtarget &STI)
+    : TargetLowering(tm), Subtarget(STI) {
    MVT PtrVT = getPointerTy();
  
    // Set up the register classes.
@@ -96,7 +96,7 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
    addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
  
    // Compute derived properties from the register classes
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget.getRegisterInfo());
  
    // Set up special registers.
    setExceptionPointerRegister(SystemZ::R6D);
@@ -163,8 +163,13 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
        // available, or if the operand is constant.
        setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
  
+      // Use POPCNT on z196 and above.
+      if (Subtarget.hasPopulationCount())
+        setOperationAction(ISD::CTPOP, VT, Custom);
+      else
+        setOperationAction(ISD::CTPOP, VT, Expand);
+
        // No special instructions for these.
-      setOperationAction(ISD::CTPOP,           VT, Expand);
        setOperationAction(ISD::CTTZ,            VT, Expand);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
        setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
@@ -218,10 +223,12 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
  
    // We have native instructions for i8, i16 and i32 extensions, but not i1.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1, Promote);
+  }
  
    // Handle the various types of symbolic address.
    setOperationAction(ISD::ConstantPool,     PtrVT, Custom);
@@ -275,7 +282,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
    // Needed so that we don't try to implement f128 constant loads using
    // a load-and-extend of a f80 constant (in cases where the constant
    // would fit in an f80).
-  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+  for (MVT VT : MVT::fp_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  
    // Floating-point truncation and stores need to be done separately.
    setTruncStoreAction(MVT::f64,  MVT::f32, Expand);
@@ -339,9 +347,20 @@ bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
    return Imm.isZero() || Imm.isNegZero();
  }
  
-bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                          unsigned,
-                                                          bool *Fast) const {
+// Return true if Imm can be used directly as the immediate operand of an
+// integer comparison, i.e. if it fits in 32 bits when read either as a
+// signed or as an unsigned value.
+bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // CGFI or CLGFI can be used for these, so either 32-bit range will do.
+  if (isInt<32>(Imm))
+    return true;
+  return isUInt<32>(Imm);
+}
+
+// Return true if Imm can be used directly as the immediate operand of an
+// addition, i.e. if either Imm or its negation fits in an unsigned 32-bit
+// field.
+bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // We can use ALGFI or SLGFI.
+  //
+  // The negation is done in unsigned 64-bit arithmetic: evaluating -Imm on
+  // the signed value would overflow (undefined behavior) for INT64_MIN.
+  // For every other input the result is the same as before.
+  uint64_t NegImm = 0 - static_cast<uint64_t>(Imm);
+  return isUInt<32>(Imm) || isUInt<32>(NegImm);
+}
+
+bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                           unsigned,
+                                                           unsigned,
+                                                           bool *Fast) const {
    // Unaligned accesses should never be slower than the expanded version.
    // We check specifically for aligned accesses in the few cases where
    // they are required.
@@ -495,8 +514,10 @@ parseRegisterNumber(const std::string &Constraint,
    return std::make_pair(0U, nullptr);
  }
  
-std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
+std::pair<unsigned, const TargetRegisterClass *>
+SystemZTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, const std::string &Constraint,
+    MVT VT) const {
    if (Constraint.size() == 1) {
      // GCC Constraint Letters
      switch (Constraint[0]) {
@@ -553,7 +574,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
                                   SystemZMC::FP64Regs);
      }
    }
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  }
  
  void SystemZTargetLowering::
@@ -672,12 +693,13 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    MachineFrameInfo *MFI = MF.getFrameInfo();
    MachineRegisterInfo &MRI = MF.getRegInfo();
    SystemZMachineFunctionInfo *FuncInfo =
-    MF.getInfo<SystemZMachineFunctionInfo>();
-  auto *TFL = static_cast<const SystemZFrameLowering *>(TM.getFrameLowering());
+      MF.getInfo<SystemZMachineFunctionInfo>();
+  auto *TFL =
+      static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
    CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
  
    unsigned NumFixedGPRs = 0;
@@ -772,15 +794,15 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
        }
        // Join the stores, which are independent of one another.
        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                          ArrayRef<SDValue>(&MemOps[NumFixedFPRs],
-                                            SystemZ::NumArgFPRs-NumFixedFPRs));
+                          makeArrayRef(&MemOps[NumFixedFPRs],
+                                       SystemZ::NumArgFPRs-NumFixedFPRs));
      }
    }
  
    return Chain;
  }
  
-static bool canUseSiblingCall(CCState ArgCCInfo,
+static bool canUseSiblingCall(const CCState &ArgCCInfo,
                                SmallVectorImpl<CCValAssign> &ArgLocs) {
    // Punt if there are any indirect or stack arguments, or if the call
    // needs the call-saved argument register R6.
@@ -815,7 +837,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
  
    // Analyze the operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
+  CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
    ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
  
    // We don't support GuaranteedTailCallOpt, only automatically-detected
@@ -911,6 +933,12 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
      Ops.push_back(DAG.getRegister(RegsToPass[I].first,
                                    RegsToPass[I].second.getValueType()));
  
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
    // Glue the call to the argument copies, if any.
    if (Glue.getNode())
      Ops.push_back(Glue);
@@ -931,7 +959,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
  
    // Assign locations to each value returned by this call.
    SmallVector<CCValAssign, 16> RetLocs;
-  CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext());
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
    RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
  
    // Copy all of the result registers out of their specified physreg.
@@ -962,7 +990,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
  
    // Assign locations to each returned value.
    SmallVector<CCValAssign, 16> RetLocs;
-  CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext());
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
    RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
  
    // Quick exit for void returns
@@ -1180,7 +1208,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, Comparison &C) {
                             Load->getChain(), Load->getBasePtr(),
                             Load->getPointerInfo(), Load->getMemoryVT(),
                             Load->isVolatile(), Load->isNonTemporal(),
-                           Load->getAlignment());
+                           Load->isInvariant(), Load->getAlignment());
  
    // Make sure that the second operand is an i32 with the right value.
    if (C.Op1.getValueType() != MVT::i32 ||
@@ -1517,6 +1545,8 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, Comparison &C) {
      MaskVal = -(CmpVal & -CmpVal);
      NewC.ICmpType = SystemZICMP::UnsignedOnly;
    }
+  if (!MaskVal)
+    return;
  
    // Check whether the combination of mask, comparison value and comparison
    // type are suitable.
@@ -1769,12 +1799,8 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
      }
    }
  
-  SmallVector<SDValue, 5> Ops;
-  Ops.push_back(TrueOp);
-  Ops.push_back(FalseOp);
-  Ops.push_back(DAG.getConstant(C.CCValid, MVT::i32));
-  Ops.push_back(DAG.getConstant(C.CCMask, MVT::i32));
-  Ops.push_back(Glue);
+  SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, MVT::i32),
+                   DAG.getConstant(C.CCMask, MVT::i32), Glue};
  
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
    return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
@@ -1786,8 +1812,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
    const GlobalValue *GV = Node->getGlobal();
    int64_t Offset = Node->getOffset();
    EVT PtrVT = getPointerTy();
-  Reloc::Model RM = TM.getRelocationModel();
-  CodeModel::Model CM = TM.getCodeModel();
+  Reloc::Model RM = DAG.getTarget().getRelocationModel();
+  CodeModel::Model CM = DAG.getTarget().getCodeModel();
  
    SDValue Result;
    if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
@@ -1819,15 +1845,59 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
    return Result;
  }
  
+// Build a call node with opcode Opcode for the TLS symbol referenced by
+// Node.  GOTOffset is passed in %r2 and the GOT address in %r12; the
+// returned SDValue is the copy of %r2 made after the call.
+SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
+                                                 SelectionDAG &DAG,
+                                                 unsigned Opcode,
+                                                 SDValue GOTOffset) const {
+  SDLoc DL(Node);
+  EVT PtrVT = getPointerTy();
+  SDValue CallChain = DAG.getEntryNode();
+  SDValue InGlue;
+
+  // Set up the two argument registers: __tls_get_offset expects the GOT
+  // in %r12 and the GOT offset in %r2.  Each copy is glued to the
+  // previous one so that they stay together ahead of the call.
+  SDValue GOTAddr = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+  CallChain = DAG.getCopyToReg(CallChain, DL, SystemZ::R12D, GOTAddr, InGlue);
+  InGlue = CallChain.getValue(1);
+  CallChain = DAG.getCopyToReg(CallChain, DL, SystemZ::R2D, GOTOffset, InGlue);
+  InGlue = CallChain.getValue(1);
+
+  // Operand list of the call: chain first, then the TLS symbol.
+  SmallVector<SDValue, 8> CallOps;
+  CallOps.push_back(CallChain);
+  SDValue TLSSymbol = DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
+                                                 Node->getValueType(0),
+                                                 0, 0);
+  CallOps.push_back(TLSSymbol);
+
+  // The argument registers follow, which marks them as live into the call.
+  CallOps.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
+  CallOps.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
+
+  // Next comes the mask of registers preserved across a C-convention call.
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const uint32_t *PreservedMask =
+      RegInfo->getCallPreservedMask(MF, CallingConv::C);
+  assert(PreservedMask && "Missing call preserved mask for calling convention");
+  CallOps.push_back(DAG.getRegisterMask(PreservedMask));
+
+  // Finally the glue that ties the call to the register copies above.
+  CallOps.push_back(InGlue);
+
+  // Create the call node itself; it yields a chain and a glue result.
+  SDVTList CallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  CallChain = DAG.getNode(Opcode, DL, CallVTs, CallOps);
+  InGlue = CallChain.getValue(1);
+
+  // The result is whatever the call left in %r2.
+  return DAG.getCopyFromReg(CallChain, DL, SystemZ::R2D, PtrVT, InGlue);
+}
+
  SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
                                                      SelectionDAG &DAG) const {
    SDLoc DL(Node);
    const GlobalValue *GV = Node->getGlobal();
    EVT PtrVT = getPointerTy();
-  TLSModel::Model model = TM.getTLSModel(GV);
-
-  if (model != TLSModel::LocalExec)
-    llvm_unreachable("only local-exec TLS mode supported");
+  TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
  
    // The high part of the thread pointer is in access register 0.
    SDValue TPHi = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
@@ -1844,15 +1914,79 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
                                     DAG.getConstant(32, PtrVT));
    SDValue TP = DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
  
-  // Get the offset of GA from the thread pointer.
-  SystemZConstantPoolValue *CPV =
-    SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+  // Get the offset of GA from the thread pointer, based on the TLS model.
+  SDValue Offset;
+  switch (model) {
+    case TLSModel::GeneralDynamic: {
+      // Load the GOT offset of the tls_index (module ID / per-symbol offset).
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+
+      // Call __tls_get_offset to retrieve the offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
+      break;
+    }
+
+    case TLSModel::LocalDynamic: {
+      // Load the GOT offset of the module ID.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+
+      // Call __tls_get_offset to retrieve the module base offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
+
+      // Note: The SystemZLDCleanupPass will remove redundant computations
+      // of the module base offset.  Count total number of local-dynamic
+      // accesses to trigger execution of that pass.
+      SystemZMachineFunctionInfo* MFI =
+        DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
+      MFI->incNumLocalDynamicTLSAccesses();
+
+      // Add the per-symbol offset.
+      CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
+
+      SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
+      DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                              DTPOffset, MachinePointerInfo::getConstantPool(),
+                              false, false, false, 0);
+
+      Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
+      break;
+    }
+
+    case TLSModel::InitialExec: {
+      // Load the offset from the GOT.
+      Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                          SystemZII::MO_INDNTPOFF);
+      Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getGOT(),
+                           false, false, false, 0);
+      break;
+    }
  
-  // Force the offset into the constant pool and load it from there.
-  SDValue CPAddr = DAG.getConstantPool(CPV, PtrVT, 8);
-  SDValue Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-                              CPAddr, MachinePointerInfo::getConstantPool(),
-                              false, false, false, 0);
+    case TLSModel::LocalExec: {
+      // Force the offset into the constant pool and load it from there.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+      break;
+    }
+  }
  
    // Add the base and offset together.
    return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
@@ -2125,8 +2259,8 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
    // Get the known-zero masks for each operand.
    SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
    APInt KnownZero[2], KnownOne[2];
-  DAG.ComputeMaskedBits(Ops[0], KnownZero[0], KnownOne[0]);
-  DAG.ComputeMaskedBits(Ops[1], KnownZero[1], KnownOne[1]);
+  DAG.computeKnownBits(Ops[0], KnownZero[0], KnownOne[0]);
+  DAG.computeKnownBits(Ops[1], KnownZero[1], KnownOne[1]);
  
    // See if the upper 32 bits of one operand and the lower 32 bits of the
    // other are known zero.  They are the low and high operands respectively.
@@ -2175,6 +2309,45 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
                                     MVT::i64, HighOp, Low32);
  }
  
+// Lower a CTPOP node via SystemZISD::POPCNT.  POPCNT yields a separate bit
+// count for each byte of its i64 operand, so the per-byte counts are then
+// summed with a shift-and-add tree and the total is extracted from the
+// highest byte of the part of the value that may be nonzero.
+SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int64_t OrigBitSize = VT.getSizeInBits();
+  SDLoc DL(Op);
+
+  // Get the known-zero mask for the operand.
+  Op = Op.getOperand(0);
+  APInt KnownZero, KnownOne;
+  DAG.computeKnownBits(Op, KnownZero, KnownOne);
+  uint64_t Mask = ~KnownZero.getZExtValue();
+
+  // Skip known-zero high parts of the operand.  Stop once a single byte is
+  // left: narrowing further cannot save any work below (both the summation
+  // loop and the final shift are no-ops for BitSize <= 8), and without this
+  // bound the loop would never terminate for an operand whose bits above
+  // bit 0 are all known to be zero, because BitSize would reach 0 and the
+  // tested mask would then always be empty.
+  int64_t BitSize = OrigBitSize;
+  while (BitSize > 8) {
+    int64_t HalfSize = BitSize / 2;
+    // Bits [HalfSize, BitSize) of the operand.
+    uint64_t HighHalf = (((uint64_t)1 << HalfSize) - 1) << HalfSize;
+    if ((Mask & HighHalf) != 0)
+      break;
+    BitSize = HalfSize;
+  }
+
+  // The POPCNT instruction counts the number of bits in each byte.
+  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
+  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
+  Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+
+  // Add up per-byte counts in a binary tree.  All bits of Op at
+  // position larger than BitSize remain zero throughout.
+  for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, VT));
+    // The mask is only needed when the operand was narrowed; BitSize is
+    // then strictly less than 64, so the shift below is well defined.
+    if (BitSize != OrigBitSize)
+      Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
+                        DAG.getConstant(((uint64_t)1 << BitSize) - 1, VT));
+    Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
+  }
+
+  // Extract overall result from high byte.
+  if (BitSize > 8)
+    Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(BitSize - 8, VT));
+
+  return Op;
+}
+
  // Op is an atomic load.  Lower it into a normal volatile load.
  SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
                                                  SelectionDAG &DAG) const {
@@ -2287,9 +2460,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
        // Use an addition if the operand is constant and either LAA(G) is
        // available or the negative value is in the range of A(G)FHI.
        int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
-      if (isInt<32>(Value) || TM.getSubtargetImpl()->hasInterlockedAccess1())
+      if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
          NegSrc2 = DAG.getConstant(Value, MemVT);
-    } else if (TM.getSubtargetImpl()->hasInterlockedAccess1())
+    } else if (Subtarget.hasInterlockedAccess1())
        // Use LAA(G) if available.
        NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, MemVT),
                              Src2);
@@ -2425,6 +2598,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
      return lowerUDIVREM(Op, DAG);
    case ISD::OR:
      return lowerOR(Op, DAG);
+  case ISD::CTPOP:
+    return lowerCTPOP(Op, DAG);
    case ISD::ATOMIC_SWAP:
      return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
    case ISD::ATOMIC_STORE:
@@ -2602,7 +2777,8 @@ static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
  MachineBasicBlock *
  SystemZTargetLowering::emitSelect(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
  
    unsigned DestReg  = MI->getOperand(0).getReg();
    unsigned TrueReg  = MI->getOperand(1).getReg();
@@ -2650,7 +2826,8 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
                                       MachineBasicBlock *MBB,
                                       unsigned StoreOpcode, unsigned STOCOpcode,
                                       bool Invert) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
  
    unsigned SrcReg     = MI->getOperand(0).getReg();
    MachineOperand Base = MI->getOperand(1);
@@ -2665,7 +2842,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
    // Use STOCOpcode if possible.  We could use different store patterns in
    // order to avoid matching the index register, but the performance trade-offs
    // might be more complicated in that case.
-  if (STOCOpcode && !IndexReg && TM.getSubtargetImpl()->hasLoadStoreOnCond()) {
+  if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) {
      if (Invert)
        CCMask ^= CCValid;
      BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
@@ -2717,8 +2894,9 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
                                              unsigned BinOpcode,
                                              unsigned BitSize,
                                              bool Invert) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
    MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
    MachineRegisterInfo &MRI = MF.getRegInfo();
    bool IsSubWord = (BitSize < 32);
  
@@ -2788,14 +2966,10 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
      unsigned Tmp = MRI.createVirtualRegister(RC);
      BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
        .addReg(RotatedOldVal).addOperand(Src2);
-    if (BitSize < 32)
+    if (BitSize <= 32)
        // XILF with the upper BitSize bits set.
        BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
-        .addReg(Tmp).addImm(uint32_t(~0 << (32 - BitSize)));
-    else if (BitSize == 32)
-      // XILF with every bit set.
-      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
-        .addReg(Tmp).addImm(~uint32_t(0));
+        .addReg(Tmp).addImm(-1U << (32 - BitSize));
      else {
        // Use LCGR and add -1 to the result, which is more compact than
        // an XILF, XILH pair.
@@ -2840,8 +3014,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
                                              unsigned CompareOpcode,
                                              unsigned KeepOldMask,
                                              unsigned BitSize) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
    MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
    MachineRegisterInfo &MRI = MF.getRegInfo();
    bool IsSubWord = (BitSize < 32);
  
@@ -2951,8 +3126,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
  MachineBasicBlock *
  SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
                                            MachineBasicBlock *MBB) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
    MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
    MachineRegisterInfo &MRI = MF.getRegInfo();
  
    // Extract the operands.  Base can be a register or a frame index.
@@ -3067,8 +3243,9 @@ MachineBasicBlock *
  SystemZTargetLowering::emitExt128(MachineInstr *MI,
                                    MachineBasicBlock *MBB,
                                    bool ClearEven, unsigned SubReg) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
    MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
    MachineRegisterInfo &MRI = MF.getRegInfo();
    DebugLoc DL = MI->getDebugLoc();
  
@@ -3098,8 +3275,9 @@ MachineBasicBlock *
  SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
                                           MachineBasicBlock *MBB,
                                           unsigned Opcode) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
    MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
    MachineRegisterInfo &MRI = MF.getRegInfo();
    DebugLoc DL = MI->getDebugLoc();
  
@@ -3267,8 +3445,9 @@ MachineBasicBlock *
  SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
                                           MachineBasicBlock *MBB,
                                           unsigned Opcode) const {
-  const SystemZInstrInfo *TII = TM.getInstrInfo();
    MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
    MachineRegisterInfo &MRI = MF.getRegInfo();
    DebugLoc DL = MI->getDebugLoc();