X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FPowerPC%2FPPCISelLowering.cpp;h=0765638dce2252d9bd7278e489bc6ea7b8dc9a12;hb=7b93acde3206e916752d94288f25ae5f08a4d419;hp=06cf9af80b6a4df720a8d1988f21d0b7d7a1e0bb;hpb=2bea9472076899721460b7aa106de2f6e21cef58;p=oota-llvm.git diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 06cf9af80b6..0765638dce2 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -13,6 +13,7 @@ #include "PPCISelLowering.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPCCallingConv.h" #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" #include "PPCTargetMachine.h" @@ -24,6 +25,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -55,9 +57,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); // FIXME: Remove this once the bug has been fixed! extern cl::opt ANDIGlueBug; -PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) - : TargetLowering(TM), - Subtarget(*TM.getSubtargetImpl()) { +PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, + const PPCSubtarget &STI) + : TargetLowering(TM), Subtarget(STI) { // Use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); @@ -73,8 +75,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) addRegisterClass(MVT::f64, &PPC::F8RCRegClass); // PowerPC has an i16 but no i8 (or i1) SEXTLOAD - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand); + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); + } setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -84,11 +88,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -113,12 +121,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) if (ANDIGlueBug) setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); - setTruncStoreAction(MVT::i32, MVT::i1, Expand); - setTruncStoreAction(MVT::i16, MVT::i1, Expand); - setTruncStoreAction(MVT::i8, MVT::i1, Expand); + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + 
setTruncStoreAction(VT, MVT::i1, Expand); + } addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass); } @@ -169,13 +176,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) // If we're enabling GP optimizations, use hardware square root if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && - Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) + !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && + Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && - Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) + !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && + Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { @@ -393,14 +400,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. - for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)i; - + for (MVT VT : MVT::vector_valuetypes()) { // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD , VT, Legal); setOperationAction(ISD::SUB , VT, Legal); + // Vector instructions introduced in P8 + if (Subtarget.hasP8Altivec()) { + setOperationAction(ISD::CTPOP, VT, Legal); + setOperationAction(ISD::CTLZ, VT, Legal); + } + else { + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + // We promote all shuffles to v16i8. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote); AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8); @@ -455,22 +469,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); - for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) { - MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j; + for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } - setLoadExtAction(ISD::SEXTLOAD, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, Expand); } // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle @@ -595,6 +605,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); } + + if (Subtarget.hasP8Altivec()) + addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass); } if (Subtarget.has64BitSupport()) @@ -630,6 +643,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SINT_TO_FP); + if (Subtarget.hasFPCVT()) + setTargetDAGCombine(ISD::UINT_TO_FP); 
setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); @@ -672,13 +687,33 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. - if (Subtarget.useCRBits()) + if (Subtarget.useCRBits()) { setHasMultipleConditionRegisters(); + setJumpIsExpensive(); + } setMinFunctionAlignment(2); if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); + switch (Subtarget.getDarwinDirective()) { + default: break; + case PPC::DIR_970: + case PPC::DIR_A2: + case PPC::DIR_E500mc: + case PPC::DIR_E5500: + case PPC::DIR_PWR4: + case PPC::DIR_PWR5: + case PPC::DIR_PWR5X: + case PPC::DIR_PWR6: + case PPC::DIR_PWR6X: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: + setPrefFunctionAlignment(4); + setPrefLoopAlignment(4); + break; + } + setInsertFencesForAtomic(true); if (Subtarget.enableMachineScheduler()) @@ -688,8 +723,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) computeRegisterProperties(); - // The Freescale cores does better with aggressive inlining of memcpy and - // friends. Gcc uses same threshold of 128 bytes (= 32 word stores). + // The Freescale cores do better with aggressive inlining of memcpy and + // friends. GCC uses same threshold of 128 bytes (= 32 word stores). if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || Subtarget.getDarwinDirective() == PPC::DIR_E5500) { MaxStoresPerMemset = 32; @@ -698,8 +733,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM) MaxStoresPerMemcpyOptSize = 8; MaxStoresPerMemmove = 32; MaxStoresPerMemmoveOptSize = 8; - - setPrefFunctionAlignment(4); } } @@ -751,19 +784,23 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { default: return nullptr; case PPCISD::FSEL: return "PPCISD::FSEL"; case PPCISD::FCFID: return "PPCISD::FCFID"; + case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; + case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; + case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; + case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; + case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; case PPCISD::FRE: return "PPCISD::FRE"; case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; case PPCISD::VPERM: return "PPCISD::VPERM"; + case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; - case PPCISD::LOAD: return "PPCISD::LOAD"; - case PPCISD::LOAD_TOC: return "PPCISD::LOAD_TOC"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; @@ -771,10 +808,9 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::SHL: return "PPCISD::SHL"; case PPCISD::CALL: return "PPCISD::CALL"; case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; - case PPCISD::CALL_TLS: return "PPCISD::CALL_TLS"; - case PPCISD::CALL_NOP_TLS: return "PPCISD::CALL_NOP_TLS"; case PPCISD::MTCTR: return "PPCISD::MTCTR"; case PPCISD::BCTRL: return "PPCISD::BCTRL"; + case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; case 
PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; @@ -784,6 +820,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VCMPo: return "PPCISD::VCMPo"; case PPCISD::LBRX: return "PPCISD::LBRX"; case PPCISD::STBRX: return "PPCISD::STBRX"; + case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; + case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; case PPCISD::LARX: return "PPCISD::LARX"; case PPCISD::STCX: return "PPCISD::STCX"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; @@ -803,8 +841,12 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; + case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; + case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; + case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; + case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; @@ -854,7 +896,7 @@ static bool isConstantOrUndef(int Op, int Val) { /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -885,7 +927,7 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -940,7 +982,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 2) // swapped @@ -965,7 +1007,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, /// the input operands are swapped (see PPCInstrAltivec.td). 
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 2) // swapped @@ -1009,8 +1051,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, if (ShiftAmt < i) return -1; ShiftAmt -= i; - bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()-> - isLittleEndian(); + bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian(); if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { // Check the rest of the elements to see if they are consecutive. @@ -1083,7 +1124,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); - if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) + if (DAG.getTarget().getDataLayout()->isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; @@ -1519,8 +1560,9 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, /// GetLabelAccessInfo - Return true if we should reference labels using a /// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags. -static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, - unsigned &LoOpFlags, +static bool GetLabelAccessInfo(const TargetMachine &TM, + const PPCSubtarget &Subtarget, + unsigned &HiOpFlags, unsigned &LoOpFlags, const GlobalValue *GV = nullptr) { HiOpFlags = PPCII::MO_HA; LoOpFlags = PPCII::MO_LO; @@ -1535,7 +1577,7 @@ static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, // If this is a reference to a global value that requires a non-lazy-ptr, make // sure that instruction lowering adds it. - if (GV && TM.getSubtarget().hasLazyResolverStub(GV, TM)) { + if (GV && Subtarget.hasLazyResolverStub(GV)) { HiOpFlags |= PPCII::MO_NLP_FLAG; LoOpFlags |= PPCII::MO_NLP_FLAG; @@ -1567,6 +1609,15 @@ static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); } +static void setUsesTOCBasePtr(MachineFunction &MF) { + PPCFunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setUsesTOCBasePtr(); +} + +static void setUsesTOCBasePtr(SelectionDAG &DAG) { + setUsesTOCBasePtr(DAG.getMachineFunction()); +} + SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); @@ -1576,13 +1627,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); } unsigned MOHiFlag, MOLoFlag; - bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag); if (isPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), @@ -1606,13 +1659,15 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // 64-bit SVR4 ABI code is always position-independent. 
// The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); } unsigned MOHiFlag, MOLoFlag; - bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag); if (isPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, @@ -1636,39 +1691,20 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual BlockAddress is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(BASDN), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); } unsigned MOHiFlag, MOLoFlag; - bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag); SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG); } -// Generate a call to __tls_get_addr for the given GOT entry Op. -std::pair -PPCTargetLowering::lowerTLSCall(SDValue Op, SDLoc dl, - SelectionDAG &DAG) const { - - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Op; - Entry.Ty = IntPtrTy; - Args.push_back(Entry); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, IntPtrTy, - DAG.getTargetExternalSymbol("__tls_get_addr", getPointerTy()), - std::move(Args), 0); - - return LowerCallTo(CLI); -} - SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { @@ -1703,6 +1739,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, PPCII::MO_TLS); SDValue GOTPtr; if (is64bit) { + setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA); @@ -1714,10 +1751,10 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, } if (Model == TLSModel::GeneralDynamic) { - SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TLSGD); + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { + setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, GOTReg, TGA); @@ -1727,17 +1764,15 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } - SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT, - GOTPtr, TGA); - std::pair CallResult = lowerTLSCall(GOTEntry, dl, DAG); - return CallResult.first; + return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, + GOTPtr, TGA, TGA); } if (Model == TLSModel::LocalDynamic) { - SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TLSLD); + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { + setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, GOTReg, TGA); @@ -1747,13 +1782,10 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } - SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT, - GOTPtr, TGA); - std::pair CallResult = lowerTLSCall(GOTEntry, dl, DAG); - SDValue TLSAddr = CallResult.first; - SDValue Chain = CallResult.second; - SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, - Chain, TLSAddr, TGA); + SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, + PtrVT, GOTPtr, TGA, TGA); + SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, + PtrVT, TLSAddr, TGA); return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); } @@ -1770,13 +1802,15 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); } unsigned MOHiFlag, MOLoFlag; - bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV); + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag, GV); if (isPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, @@ -2315,10 +2349,9 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, /// EnsureStackAlignment - Round stack frame size up from NumBytes to /// ensure minimum alignment required for target. -static unsigned EnsureStackAlignment(const TargetMachine &Target, +static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering, unsigned NumBytes) { - unsigned TargetAlign = - Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment(); + unsigned TargetAlign = Lowering->getStackAlignment(); unsigned AlignMask = TargetAlign - 1; NumBytes = (NumBytes + AlignMask) & ~AlignMask; return NumBytes; @@ -2399,7 +2432,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( *DAG.getContext()); // Reserve space for the linkage area on the stack. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false); + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); CCInfo.AllocateStack(LinkageSize, PtrByteSize); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -2485,7 +2518,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // call optimized function's reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + MinReservedArea = + EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); SmallVector MemOps; @@ -2600,14 +2634,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); + assert(!(CallConv == CallingConv::Fast && isVarArg) && + "fastcc not supported on varargs functions"); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Potential tail calls could cause overwriting of argument stack slots. 
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; - - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, - isELFv2ABI); + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -2651,7 +2686,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // although the first ones are often in registers. unsigned ArgOffset = LinkageSize; - unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; + unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; @@ -2666,19 +2701,31 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); CurArgIdx = Ins[ArgNo].OrigArgIndex; - /* Respect alignment of argument on the stack. */ - unsigned Align = - CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); - ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; - unsigned CurArgOffset = ArgOffset; + // We re-align the argument offset for each argument, except when using the + // fast calling convention, when we need to make sure we do that only when + // we'll actually use a stack slot. + unsigned CurArgOffset, Align; + auto ComputeArgOffset = [&]() { + /* Respect alignment of argument on the stack. */ + Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + CurArgOffset = ArgOffset; + }; - /* Compute GPR index associated with argument offset. */ - GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; - GPR_idx = std::min(GPR_idx, Num_GPR_Regs); + if (CallConv != CallingConv::Fast) { + ComputeArgOffset(); + + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, Num_GPR_Regs); + } // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. if (Flags.isByVal()) { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; @@ -2722,7 +2769,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( InVals.push_back(Arg); if (GPR_idx != Num_GPR_Regs) { - unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store; @@ -2784,7 +2831,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { - unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) @@ -2792,10 +2839,14 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // value to MVT::i64 and then truncate to the correct register size. 
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + needsLoad = true; ArgSize = PtrByteSize; } - ArgOffset += 8; + if (CallConv != CallingConv::Fast || needsLoad) + ArgOffset += 8; break; case MVT::f32: @@ -2809,17 +2860,20 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); else - VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? - &PPC::VSFRCRegClass : - &PPC::F8RCRegClass); + VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() + ? &PPC::VSFRCRegClass + : &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; - } else if (GPR_idx != Num_GPR_Regs) { + } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { + // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 + // once we support fp <-> gpr moves. + // This can only ever happen in the presence of f32 array types, // since otherwise we never run out of FPRs before running out // of GPRs. - unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::f32) { @@ -2831,16 +2885,21 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + needsLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. - ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; - ArgOffset += ArgSize; - if (Flags.isInConsecutiveRegsLast()) - ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + if (CallConv != CallingConv::Fast || needsLoad) { + ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; + ArgOffset += ArgSize; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + } break; case MVT::v4f32: case MVT::v4i32: @@ -2858,9 +2917,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++VR_idx; } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + needsLoad = true; } - ArgOffset += 16; + if (CallConv != CallingConv::Fast || needsLoad) + ArgOffset += 16; break; } @@ -2889,7 +2952,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + MinReservedArea = + EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for @@ -2943,9 +3007,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 8 : 4; - - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, - false); + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. 
unsigned MinReservedArea = ArgOffset; @@ -3250,7 +3312,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin( // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + MinReservedArea = + EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for @@ -3405,8 +3468,9 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, if (SPDiff) { // Calculate the new stack slot for the return address. int SlotSize = isPPC64 ? 8 : 4; - int NewRetAddrLoc = SPDiff + PPCFrameLowering::getReturnSaveOffset(isPPC64, - isDarwinABI); + const PPCFrameLowering *FL = + MF.getSubtarget().getFrameLowering(); + int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewRetAddrLoc, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; @@ -3418,8 +3482,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack // slot as the FP is never overwritten. if (isDarwinABI) { - int NewFPLoc = - SPDiff + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset(); int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc, true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); @@ -3549,12 +3612,27 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, InFlag = Chain.getValue(1); } +// Is this global address that of a function that can be called by name? (as +// opposed to something that must hold a descriptor for an indirect call). +static bool isFunctionGlobalAddress(SDValue Callee) { + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + if (Callee.getOpcode() == ISD::GlobalTLSAddress || + Callee.getOpcode() == ISD::TargetGlobalTLSAddress) + return false; + + return G->getGlobal()->getType()->getElementType()->isFunctionTy(); + } + + return false; +} + static unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, - SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall, + SDValue &Chain, SDValue CallSeqStart, SDLoc dl, int SPDiff, + bool isTailCall, bool IsPatchPoint, SmallVectorImpl > &RegsToPass, SmallVectorImpl &Ops, std::vector &NodeTys, - const PPCSubtarget &Subtarget) { + ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); @@ -3574,7 +3652,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, needIndirectCall = false; } - if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + if (isFunctionGlobalAddress(Callee)) { + GlobalAddressSDNode *G = cast(Callee); + // A call to a TLS address is actually an indirect call to a + // thread-specific pointer. 
unsigned OpFlags = 0; if ((DAG.getTarget().getRelocationModel() != Reloc::Static && (Subtarget.getTargetTriple().isMacOSX() && @@ -3605,7 +3686,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, (Subtarget.getTargetTriple().isMacOSX() && Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) || (Subtarget.isTargetELF() && !isPPC64 && - DAG.getTarget().getRelocationModel() == Reloc::PIC_) ) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -3617,6 +3698,16 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, needIndirectCall = false; } + if (IsPatchPoint) { + // We'll form an invalid direct call when lowering a patchpoint; the full + // sequence for an indirect call is complicated, and many of the + // instructions introduced might have side effects (and, thus, can't be + // removed later). The call itself will be removed as soon as the + // argument/return lowering is complete, so the fact that it has the wrong + // kind of operands should not really matter. + needIndirectCall = false; + } + if (needIndirectCall) { // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair // to do the call, we can't use PPCISD::CALL. @@ -3642,50 +3733,51 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // 6. On return of the callee, the TOC of the caller needs to be // restored (this is done in FinishCall()). // - // All those operations are flagged together to ensure that no other + // The loads are scheduled at the beginning of the call sequence, and the + // register copies are flagged together to ensure that no other // operations can be scheduled in between. E.g. without flagging the - // operations together, a TOC access in the caller could be scheduled - // between the load of the callee TOC and the branch to the callee, which + // copies together, a TOC access in the caller could be scheduled between + // the assignment of the callee TOC and the branch to the callee, which // results in the TOC access going through the TOC of the callee instead // of going through the TOC of the caller, which leads to incorrect code. // Load the address of the function entry point from the function // descriptor. - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue); - SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, - makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); - Chain = LoadFuncPtr.getValue(1); - InFlag = LoadFuncPtr.getValue(2); + SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); + if (LDChain.getValueType() == MVT::Glue) + LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); + + bool LoadsInv = Subtarget.hasInvariantFunctionDescriptors(); + + MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr); + SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, + false, false, LoadsInv, 8); // Load environment pointer into r11. - // Offset of the environment pointer within the function descriptor. 
SDValue PtrOff = DAG.getIntPtrConstant(16); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); - SDValue LoadEnvPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, Chain, AddPtr, - InFlag); - Chain = LoadEnvPtr.getValue(1); - InFlag = LoadEnvPtr.getValue(2); + SDValue LoadEnvPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, + MPI.getWithOffset(16), false, false, + LoadsInv, 8); + + SDValue TOCOff = DAG.getIntPtrConstant(8); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); + SDValue TOCPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, + MPI.getWithOffset(8), false, false, + LoadsInv, 8); + + setUsesTOCBasePtr(DAG); + SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, + InFlag); + Chain = TOCVal.getValue(0); + InFlag = TOCVal.getValue(1); SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, InFlag); + Chain = EnvVal.getValue(0); InFlag = EnvVal.getValue(1); - // Load TOC of the callee into r2. We are using a target-specific load - // with r2 hard coded, because the result of a target-independent load - // would never go directly into r2, since r2 is a reserved register (which - // prevents the register allocator from allocating it), resulting in an - // additional register being allocated and an unnecessary move instruction - // being generated. - VTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TOCOff = DAG.getIntPtrConstant(8); - SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); - SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, - AddTOC, InFlag); - Chain = LoadTOCPtr.getValue(0); - InFlag = LoadTOCPtr.getValue(1); - MTCTROps[0] = Chain; MTCTROps[1] = LoadFuncPtr; MTCTROps[2] = InFlag; @@ -3713,23 +3805,6 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (Callee.getNode()) { Ops.push_back(Chain); Ops.push_back(Callee); - - // If this is a call to __tls_get_addr, find the symbol whose address - // is to be taken and add it to the list. This will be used to - // generate __tls_get_addr(@tlsgd) or __tls_get_addr(@tlsld). - // We find the symbol by walking the chain to the CopyFromReg, walking - // back from the CopyFromReg to the ADDI_TLSGD_L or ADDI_TLSLD_L, and - // pulling the symbol from that node. - if (ExternalSymbolSDNode *S = dyn_cast(Callee)) - if (!strcmp(S->getSymbol(), "__tls_get_addr")) { - assert(!needIndirectCall && "Indirect call to __tls_get_addr???"); - SDNode *AddI = Chain.getNode()->getOperand(2).getNode(); - SDValue TGTAddr = AddI->getOperand(1); - assert(TGTAddr.getNode()->getOpcode() == ISD::TargetGlobalTLSAddress && - "Didn't find target global TLS address where we expected one"); - Ops.push_back(TGTAddr); - CallOpc = PPCISD::CALL_TLS; - } } // If this is a tail call add stack pointer delta. if (isTailCall) @@ -3741,9 +3816,12 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - // Direct calls in the ELFv2 ABI need the TOC register live into the call. - if (Callee.getNode() && isELFv2ABI) + // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live + // into the call. 
+ if (isSVR4ABI && isPPC64 && !IsPatchPoint) { + setUsesTOCBasePtr(DAG); Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + } return CallOpc; } @@ -3805,22 +3883,22 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, - bool isTailCall, bool isVarArg, + bool isTailCall, bool isVarArg, bool IsPatchPoint, SelectionDAG &DAG, SmallVector, 8> &RegsToPass, SDValue InFlag, SDValue Chain, - SDValue &Callee, + SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, - SmallVectorImpl &InVals) const { + SmallVectorImpl &InVals, + ImmutableCallSite *CS) const { - bool isELFv2ABI = Subtarget.isELFv2ABI(); std::vector NodeTys; SmallVector Ops; - unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, - isTailCall, RegsToPass, Ops, NodeTys, - Subtarget); + unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, + SPDiff, isTailCall, IsPatchPoint, RegsToPass, + Ops, NodeTys, CS, Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) @@ -3834,8 +3912,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3864,8 +3941,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // stack frame. If caller and callee belong to the same module (and have the // same TOC), the NOP will remain unchanged. - bool needsTOCRestore = false; - if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) { + if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && + !IsPatchPoint) { if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. @@ -3876,31 +3953,27 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // since r2 is a reserved register (which prevents the register allocator // from allocating it), resulting in an additional register being // allocated and an unnecessary move instruction being generated. - needsTOCRestore = true; + CallOpc = PPCISD::BCTRL_LOAD_TOC; + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); + unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); + SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + + // The address needs to go after the chain input but before the flag (or + // any other variadic arguments). + Ops.insert(std::next(Ops.begin()), AddTOC); } else if ((CallOpc == PPCISD::CALL) && (!isLocalCall(Callee) || - DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) // Otherwise insert NOP for non-local calls. CallOpc = PPCISD::CALL_NOP; - } else if (CallOpc == PPCISD::CALL_TLS) - // For 64-bit SVR4, TLS calls are always non-local. 
- CallOpc = PPCISD::CALL_NOP_TLS; } Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); - if (needsTOCRestore) { - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); - unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); - SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); - SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); - Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); - InFlag = Chain.getValue(1); - } - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(BytesCalleePops, true), InFlag, dl); @@ -3924,40 +3997,43 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; + bool IsPatchPoint = CLI.IsPatchPoint; + ImmutableCallSite *CS = CLI.CS; if (isTailCall) isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); - if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) + if (!isTailCall && CS && CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); if (Subtarget.isSVR4ABI()) { if (Subtarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, Outs, OutVals, Ins, - dl, DAG, InVals); + isTailCall, IsPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); else return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, Outs, OutVals, Ins, - dl, DAG, InVals); + isTailCall, IsPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); } return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, - isTailCall, Outs, OutVals, Ins, - dl, DAG, InVals); + isTailCall, IsPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); } SDValue PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool isTailCall, + bool isTailCall, bool IsPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { + SmallVectorImpl &InVals, + ImmutableCallSite *CS) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. @@ -3987,7 +4063,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, *DAG.getContext()); // Reserve space for the linkage area on the stack. 
- CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false), + CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), PtrByteSize); if (isVarArg) { @@ -4162,9 +4238,9 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp, false, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG, - RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes, - Ins, InVals); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, + RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, + NumBytes, Ins, InVals, CS); } // Copy an argument into memory, being careful to do this outside the @@ -4190,12 +4266,13 @@ PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff, SDValue PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool isTailCall, + bool isTailCall, bool IsPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { + SmallVectorImpl &InVals, + ImmutableCallSite *CS) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); @@ -4215,13 +4292,39 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); + assert(!(CallConv == CallingConv::Fast && isVarArg) && + "fastcc not supported on varargs functions"); + // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage // area is 32 bytes reserved space for [SP][CR][LR][TOC]. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, - isELFv2ABI); + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; + unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + + static const MCPhysReg GPR[] = { + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10, + }; + static const MCPhysReg *FPR = GetFPR(); + + static const MCPhysReg VR[] = { + PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, + PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 + }; + static const MCPhysReg VSRH[] = { + PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, + PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 + }; + + const unsigned NumGPRs = array_lengthof(GPR); + const unsigned NumFPRs = 13; + const unsigned NumVRs = array_lengthof(VR); + + // When using the fast calling convention, we don't provide backing for + // arguments that will be in registers. + unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; // Add up all the space actually used. 
for (unsigned i = 0; i != NumOps; ++i) { @@ -4229,6 +4332,35 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; + if (CallConv == CallingConv::Fast) { + if (Flags.isByVal()) + NumGPRsUsed += (Flags.getByValSize()+7)/8; + else + switch (ArgVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + if (++NumGPRsUsed <= NumGPRs) + continue; + break; + case MVT::f32: + case MVT::f64: + if (++NumFPRsUsed <= NumFPRs) + continue; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2f64: + case MVT::v2i64: + if (++NumVRsUsed <= NumVRs) + continue; + break; + } + } + /* Respect alignment of argument on the stack. */ unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); @@ -4252,7 +4384,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) - NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes); + NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. @@ -4285,26 +4417,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. unsigned ArgOffset = LinkageSize; - unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; - - static const MCPhysReg GPR[] = { - PPC::X3, PPC::X4, PPC::X5, PPC::X6, - PPC::X7, PPC::X8, PPC::X9, PPC::X10, - }; - static const MCPhysReg *FPR = GetFPR(); - - static const MCPhysReg VR[] = { - PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, - PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 - }; - static const MCPhysReg VSRH[] = { - PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, - PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 - }; - - const unsigned NumGPRs = array_lengthof(GPR); - const unsigned NumFPRs = 13; - const unsigned NumVRs = array_lengthof(VR); SmallVector, 8> RegsToPass; SmallVector TailCallArguments; @@ -4316,22 +4428,31 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; - /* Respect alignment of argument on the stack. */ - unsigned Align = - CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); - ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; - - /* Compute GPR index associated with argument offset. */ - GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; - GPR_idx = std::min(GPR_idx, NumGPRs); - // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; - PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType()); + // We re-align the argument offset for each argument, except when using the + // fast calling convention, when we need to make sure we do that only when + // we'll actually use a stack slot. + auto ComputePtrOff = [&]() { + /* Respect alignment of argument on the stack. 
*/ + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; - PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType()); + + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + }; + + if (CallConv != CallingConv::Fast) { + ComputePtrOff(); + + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, NumGPRs); + } // Promote integers to 64-bit values. if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) { @@ -4356,6 +4477,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, if (Size == 0) continue; + if (CallConv == CallingConv::Fast) + ComputePtrOff(); + // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); @@ -4364,7 +4488,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), VT, false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; continue; @@ -4426,7 +4550,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); // Done with this argument. ArgOffset += PtrByteSize; @@ -4462,13 +4586,19 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != NumGPRs) { - RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); } else { + if (CallConv == CallingConv::Fast) + ComputePtrOff(); + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); + if (CallConv == CallingConv::Fast) + ArgOffset += PtrByteSize; } - ArgOffset += PtrByteSize; + if (CallConv != CallingConv::Fast) + ArgOffset += PtrByteSize; break; case MVT::f32: case MVT::f64: { @@ -4482,6 +4612,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // then the parameter save area. For now, put all arguments to vararg // routines always in both locations (FPR *and* GPR or stack slot). bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; + bool NeededLoad = false; // First load the argument into the next available FPR. if (FPR_idx != NumFPRs) @@ -4490,7 +4621,10 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Next, load the argument into GPR or stack slot if needed. if (!NeedGPROrStack) ; - else if (GPR_idx != NumGPRs) { + else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) { + // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 + // once we support fp <-> gpr moves. + // In the non-vararg case, this can only ever happen in the // presence of f32 array types, since otherwise we never run // out of FPRs before running out of GPRs. 
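
The fastcc changes in the hunk above defer both the stack-slot alignment and the offset bump into a small lambda (ComputeArgOffset / ComputePtrOff), so arguments passed entirely in registers no longer reserve parameter-save-area space. Below is a minimal standalone sketch of that lazy-offset pattern; the alignTo helper, the Arg descriptor, and assignOffsets are illustrative names for this sketch, not code from the patch.

#include <cstdint>
#include <vector>

// Round Offset up to the next multiple of Align (Align must be a power of two).
static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

struct Arg { uint64_t Size; uint64_t Align; bool FitsInRegister; };

// Compute the parameter save area size, touching ArgOffset only for
// arguments that actually need a stack slot.
static uint64_t assignOffsets(const std::vector<Arg> &Args,
                              uint64_t LinkageSize) {
  uint64_t ArgOffset = LinkageSize;
  for (const Arg &A : Args) {
    // Defer alignment until we know a stack slot is required.
    auto ComputeArgOffset = [&] { ArgOffset = alignTo(ArgOffset, A.Align); };
    if (A.FitsInRegister)
      continue; // register-only arguments reserve no backing store
    ComputeArgOffset();
    ArgOffset += A.Size;
  }
  return ArgOffset;
}

With this pattern, a fastcc call whose arguments all fit in registers needs only the linkage area on the stack.
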
@@ -4529,8 +4663,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, ArgVal = SDValue(); if (ArgVal.getNode()) - RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); } else { + if (CallConv == CallingConv::Fast) + ComputePtrOff(); + // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. if (Arg.getValueType() == MVT::f32 && @@ -4542,14 +4679,18 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); + + NeededLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. - ArgOffset += (Arg.getValueType() == MVT::f32 && - Flags.isInConsecutiveRegs()) ? 4 : 8; - if (Flags.isInConsecutiveRegsLast()) - ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + if (CallConv != CallingConv::Fast || NeededLoad) { + ArgOffset += (Arg.getValueType() == MVT::f32 && + Flags.isInConsecutiveRegs()) ? 4 : 8; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + } break; } case MVT::v4f32: @@ -4608,11 +4749,18 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, RegsToPass.push_back(std::make_pair(VReg, Arg)); } else { + if (CallConv == CallingConv::Fast) + ComputePtrOff(); + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); + if (CallConv == CallingConv::Fast) + ArgOffset += 16; } - ArgOffset += 16; + + if (CallConv != CallingConv::Fast) + ArgOffset += 16; break; } } @@ -4626,21 +4774,23 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Check if this is an indirect call (MTCTR/BCTRL). // See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. - if (!isTailCall && - !dyn_cast(Callee) && - !dyn_cast(Callee)) { + if (!isTailCall && !IsPatchPoint && + !isFunctionGlobalAddress(Callee) && + !isa(Callee)) { // Load r2 into a virtual register and store it to the TOC save area. + setUsesTOCBasePtr(DAG); SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. - unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); + unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); - Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(), + Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, + MachinePointerInfo::getStack(TOCSaveOffset), false, false, 0); // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. 
- if (isELFv2ABI) + if (isELFv2ABI && !IsPatchPoint) RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } @@ -4657,20 +4807,21 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG, - RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes, - Ins, InVals); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, + RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, + NumBytes, Ins, InVals, CS); } SDValue PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool isTailCall, + bool isTailCall, bool IsPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { + SmallVectorImpl &InVals, + ImmutableCallSite *CS) const { unsigned NumOps = Outs.size(); @@ -4692,8 +4843,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, - false); + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; // Add up all the space actually used. @@ -4738,7 +4888,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) - NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes); + NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. @@ -5031,8 +5181,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // not mean the MTCTR instruction must use R12; it's easier to model this as // an extra parameter, so do that. if (!isTailCall && - !dyn_cast(Callee) && - !dyn_cast(Callee) && + !isFunctionGlobalAddress(Callee) && + !isa(Callee) && !isBLACompatibleAddress(Callee, DAG)) RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : PPC::R12), Callee)); @@ -5050,9 +5200,9 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG, - RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes, - Ins, InVals); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, + RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, + NumBytes, Ins, InVals, CS); } bool @@ -5151,7 +5301,6 @@ SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. The users of this index will be @@ -5162,7 +5311,7 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { // If the frame pointer save index hasn't been defined yet. if (!RASI) { // Find out what the fix offset of the frame pointer save area. 
- int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI); + int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset(); // Allocate the frame index for frame pointer save area. RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, false); // Save the result. @@ -5175,7 +5324,6 @@ SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. The users of this index will be @@ -5186,9 +5334,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { // If the frame pointer save index hasn't been defined yet. if (!FPSI) { // Find out what the fix offset of the frame pointer save area. - int FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, - isDarwinABI); - + int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset(); // Allocate the frame index for frame pointer save area. FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); // Save the result. @@ -5382,9 +5528,9 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { return Op; } -// FIXME: Split this code up when LegalizeDAGTypes lands. -SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, - SDLoc dl) const { +void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, + SelectionDAG &DAG, + SDLoc dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) @@ -5394,10 +5540,11 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: - Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ : - (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : - PPCISD::FCTIDZ), - dl, MVT::f64, Src); + Tmp = DAG.getNode( + Op.getOpcode() == ISD::FP_TO_SINT + ? PPCISD::FCTIWZ + : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), + dl, MVT::f64, Src); break; case MVT::i64: assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && @@ -5433,15 +5580,95 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, if (Op.getValueType() == MVT::i32 && !i32Stack) { FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, DAG.getConstant(4, FIPtr.getValueType())); - MPI = MachinePointerInfo(); + MPI = MPI.getWithOffset(4); } - return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI, - false, false, false, 0); + RLI.Chain = Chain; + RLI.Ptr = FIPtr; + RLI.MPI = MPI; +} + +SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, + SDLoc dl) const { + ReuseLoadInfo RLI; + LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); + + return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, false, + false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo, + RLI.Ranges); +} + +// We're trying to insert a regular store, S, and then a load, L. If the +// incoming value, O, is a load, we might just be able to have our load use the +// address used by O. However, we don't know if anything else will store to +// that address before we can load from it. To prevent this situation, we need +// to insert our load, L, into the chain as a peer of O. 
To do this, we give L +// the same chain operand as O, we create a token factor from the chain results +// of O and L, and we replace all uses of O's chain result with that token +// factor (see spliceIntoChain below for this last part). +bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, + ReuseLoadInfo &RLI, + SelectionDAG &DAG, + ISD::LoadExtType ET) const { + SDLoc dl(Op); + if (ET == ISD::NON_EXTLOAD && + (Op.getOpcode() == ISD::FP_TO_UINT || + Op.getOpcode() == ISD::FP_TO_SINT) && + isOperationLegalOrCustom(Op.getOpcode(), + Op.getOperand(0).getValueType())) { + + LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); + return true; + } + + LoadSDNode *LD = dyn_cast(Op); + if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || + LD->isNonTemporal()) + return false; + if (LD->getMemoryVT() != MemVT) + return false; + + RLI.Ptr = LD->getBasePtr(); + if (LD->isIndexed() && LD->getOffset().getOpcode() != ISD::UNDEF) { + assert(LD->getAddressingMode() == ISD::PRE_INC && + "Non-pre-inc AM on PPC?"); + RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, + LD->getOffset()); + } + + RLI.Chain = LD->getChain(); + RLI.MPI = LD->getPointerInfo(); + RLI.IsInvariant = LD->isInvariant(); + RLI.Alignment = LD->getAlignment(); + RLI.AAInfo = LD->getAAInfo(); + RLI.Ranges = LD->getRanges(); + + RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); + return true; +} + +// Given the head of the old chain, ResChain, insert a token factor containing +// it and NewResChain, and make users of ResChain now be users of that token +// factor. +void PPCTargetLowering::spliceIntoChain(SDValue ResChain, + SDValue NewResChain, + SelectionDAG &DAG) const { + if (!ResChain) + return; + + SDLoc dl(NewResChain); + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + NewResChain, DAG.getUNDEF(MVT::Other)); + assert(TF.getNode() != NewResChain.getNode() && + "A new TF really is required here"); + + DAG.ReplaceAllUsesOfValueWith(ResChain, TF); + DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); } SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc dl(Op); // Don't handle ppc_fp128 here; let it be lowered to a libcall. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) @@ -5457,13 +5684,14 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. - unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? - (Op.getOpcode() == ISD::UINT_TO_FP ? - PPCISD::FCFIDUS : PPCISD::FCFIDS) : - (Op.getOpcode() == ISD::UINT_TO_FP ? - PPCISD::FCFIDU : PPCISD::FCFID); - MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? - MVT::f32 : MVT::f64; + unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) + ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS + : PPCISD::FCFIDS) + : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU + : PPCISD::FCFID); + MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) + ? 
MVT::f32 + : MVT::f64; if (Op.getOperand(0).getValueType() == MVT::i64) { SDValue SINT = Op.getOperand(0); @@ -5513,7 +5741,70 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); } - SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); + ReuseLoadInfo RLI; + SDValue Bits; + + MachineFunction &MF = DAG.getMachineFunction(); + if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { + Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, false, + false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo, + RLI.Ranges); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + } else if (Subtarget.hasLFIWAX() && + canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; + Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, + DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + } else if (Subtarget.hasFPCVT() && + canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; + Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, + DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + } else if (((Subtarget.hasLFIWAX() && + SINT.getOpcode() == ISD::SIGN_EXTEND) || + (Subtarget.hasFPCVT() && + SINT.getOpcode() == ISD::ZERO_EXTEND)) && + SINT.getOperand(0).getValueType() == MVT::i32) { + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Store = + DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, 0); + + assert(cast(Store)->getMemoryVT() == MVT::i32 && + "Expected an i32 store"); + + RLI.Ptr = FIdx; + RLI.Chain = Store; + RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.Alignment = 4; + + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; + Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? 
+ PPCISD::LFIWZX : PPCISD::LFIWAX, + dl, DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + } else + Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) @@ -5534,23 +5825,36 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SDValue Ld; if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { - int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + ReuseLoadInfo RLI; + bool ReusingLoad; + if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, + DAG))) { + int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, 0); + + assert(cast(Store)->getMemoryVT() == MVT::i32 && + "Expected an i32 store"); + + RLI.Ptr = FIdx; + RLI.Chain = Store; + RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.Alignment = 4; + } - assert(cast(Store)->getMemoryVT() == MVT::i32 && - "Expected an i32 store"); MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOLoad, 4, 4); - SDValue Ops[] = { Store, FIdx }; + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); + if (ReusingLoad) + spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); } else { assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); @@ -6463,7 +6767,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, - SDLoc(Op)); + SDLoc(Op)); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); @@ -6493,7 +6797,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { void PPCTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { - const TargetMachine &TM = getTargetMachine(); SDLoc dl(N); switch (N->getOpcode()) { default: @@ -6524,8 +6827,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::VAARG: { - if (!TM.getSubtarget().isSVR4ABI() - || TM.getSubtarget().isPPC64()) + if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) return; EVT VT = N->getValueType(0); @@ -6607,8 +6909,7 @@ MachineBasicBlock * PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, bool is64bit, unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 
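// In outline, the control flow constructed below is the standard PowerPC
// load-reserve/store-conditional loop (ldarx/stdcx. in the 64-bit case):
//   loopMBB:
//     l[wd]arx    dest, 0, ptr
//     <BinOpcode> tmp, incr, dest
//     st[wd]cx.   tmp, 0, ptr
//     bne-        loopMBB
//   exitMBB:
// with the middle operation dropped for ATOMIC_SWAP (BinOpcode == 0), in which
// case incr itself is stored.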
- const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); @@ -6669,8 +6970,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, bool is8bit, // operation unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're @@ -6797,8 +7097,7 @@ llvm::MachineBasicBlock* PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6871,6 +7170,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned BufReg = MI->getOperand(1).getReg(); if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { + setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) .addImm(TOCOffset) @@ -6889,15 +7189,14 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) - .addReg(BaseReg) - .addImm(BPOffset) - .addReg(BufReg); + .addReg(BaseReg) + .addImm(BPOffset) + .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); - const PPCRegisterInfo *TRI = - getTargetMachine().getSubtarget().getRegisterInfo(); + const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); @@ -6911,8 +7210,9 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // mainMBB: // mainDstReg = 0 - MIB = BuildMI(mainMBB, DL, - TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); + MIB = + BuildMI(mainMBB, DL, + TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); // Store IP if (Subtarget.isPPC64()) { @@ -6946,8 +7246,7 @@ MachineBasicBlock * PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6966,10 +7265,13 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; - unsigned BP = (PVT == MVT::i64) ? PPC::X30 : - (Subtarget.isSVR4ABI() && - MF->getTarget().getRelocationModel() == Reloc::PIC_ ? - PPC::R29 : PPC::R30); + unsigned BP = + (PVT == MVT::i64) + ? PPC::X30 + : (Subtarget.isSVR4ABI() && + MF->getTarget().getRelocationModel() == Reloc::PIC_ + ? 
PPC::R29 + : PPC::R30); MachineInstrBuilder MIB; @@ -7032,6 +7334,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Reload TOC if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { + setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) .addImm(TOCOffset) .addReg(BufReg); @@ -7051,6 +7354,22 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock * PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { + if (MI->getOpcode() == TargetOpcode::STACKMAP || + MI->getOpcode() == TargetOpcode::PATCHPOINT) { + if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && + MI->getOpcode() == TargetOpcode::PATCHPOINT) { + // Call lowering should have added an r2 operand to indicate a dependence + // on the TOC base pointer value. It can't however, because there is no + // way to mark the dependence as implicit there, and so the stackmap code + // will confuse it with a regular operand. Instead, add the dependence + // here. + setUsesTOCBasePtr(*BB->getParent()); + MI->addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); + } + + return emitPatchPoint(MI, BB); + } + if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 || MI->getOpcode() == PPC::EH_SjLj_SetJmp64) { return emitEHSjLjSetJmp(MI, BB); @@ -7059,8 +7378,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return emitEHSjLjLongJmp(MI, BB); } - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // To "insert" these instructions we actually have to insert their // control-flow patterns. @@ -7071,9 +7389,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || - MI->getOpcode() == PPC::SELECT_CC_I8 || - MI->getOpcode() == PPC::SELECT_I4 || - MI->getOpcode() == PPC::SELECT_I8)) { + MI->getOpcode() == PPC::SELECT_CC_I8 || + MI->getOpcode() == PPC::SELECT_I4 || + MI->getOpcode() == PPC::SELECT_I8)) { SmallVector Cond; if (MI->getOpcode() == PPC::SELECT_CC_I4 || MI->getOpcode() == PPC::SELECT_CC_I8) @@ -7083,8 +7401,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, Cond.push_back(MI->getOperand(1)); DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), Cond, MI->getOperand(2).getReg(), MI->getOperand(3).getReg()); @@ -7505,7 +7821,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); // Restore FPSCR value. 
- BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg); + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT || MI->getOpcode() == PPC::ANDIo_1_GT_BIT || MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 || @@ -7545,7 +7861,7 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, bool &UseOneConstNR) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || - (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || + (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Convergence is quadratic, so we essentially double the number of digits @@ -7566,7 +7882,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, unsigned &RefinementSteps) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRES()) || - (VT == MVT::f64 && Subtarget.hasFRE()) || + (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Convergence is quadratic, so we essentially double the number of digits @@ -7778,8 +8094,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); - assert(Subtarget.useCRBits() && - "Expecting to be tracking CR bits"); + assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); // If we're tracking CR bits, we need to be careful that we don't have: // trunc(binary-ops(zext(x), zext(y))) // or @@ -8075,10 +8390,8 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, N->getValueType(0) != MVT::i64) return SDValue(); - if (!((N->getOperand(0).getValueType() == MVT::i1 && - Subtarget.useCRBits()) || - (N->getOperand(0).getValueType() == MVT::i32 && - Subtarget.isPPC64()))) + if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || + (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND && @@ -8332,6 +8645,76 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, N->getOperand(0), ShiftCst), ShiftCst); } +SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, + DAGCombinerInfo &DCI) const { + assert((N->getOpcode() == ISD::SINT_TO_FP || + N->getOpcode() == ISD::UINT_TO_FP) && + "Need an int -> FP conversion node here"); + + if (!Subtarget.has64BitSupport()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + SDValue Op(N, 0); + + // Don't handle ppc_fp128 here or i1 conversions. + if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) + return SDValue(); + if (Op.getOperand(0).getValueType() == MVT::i1) + return SDValue(); + + // For i32 intermediate values, unfortunately, the conversion functions + // leave the upper 32 bits of the value are undefined. Within the set of + // scalar instructions, we have no method for zero- or sign-extending the + // value. Thus, we cannot handle i32 intermediate values here. + if (Op.getOperand(0).getValueType() == MVT::i32) + return SDValue(); + + assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && + "UINT_TO_FP is supported only with FPCVT"); + + // If we have FCFIDS, then use it when converting to single-precision. + // Otherwise, convert to double-precision and then round. + unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) + ? (Op.getOpcode() == ISD::UINT_TO_FP ? 
PPCISD::FCFIDUS + : PPCISD::FCFIDS) + : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU + : PPCISD::FCFID); + MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) + ? MVT::f32 + : MVT::f64; + + // If we're converting from a float, to an int, and back to a float again, + // then we don't need the store/load pair at all. + if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && + Subtarget.hasFPCVT()) || + (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { + SDValue Src = Op.getOperand(0).getOperand(0); + if (Src.getValueType() == MVT::f32) { + Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); + DCI.AddToWorklist(Src.getNode()); + } + + unsigned FCTOp = + Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : + PPCISD::FCTIDUZ; + + SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); + + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { + FP = DAG.getNode(ISD::FP_ROUND, dl, + MVT::f32, FP, DAG.getIntPtrConstant(0)); + DCI.AddToWorklist(FP.getNode()); + } + + return FP; + } + + return SDValue(); +} + // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for // builtins) into loads with swaps. SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, @@ -8433,7 +8816,6 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { - const TargetMachine &TM = getTargetMachine(); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); switch (N->getOpcode()) { @@ -8466,40 +8848,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: return DAGCombineTruncBoolExt(N, DCI); case ISD::SINT_TO_FP: - if (TM.getSubtarget().has64BitSupport()) { - if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) { - // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores. - // We allow the src/dst to be either f32/f64, but the intermediate - // type must be i64. - if (N->getOperand(0).getValueType() == MVT::i64 && - N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) { - SDValue Val = N->getOperand(0).getOperand(0); - if (Val.getValueType() == MVT::f32) { - Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); - DCI.AddToWorklist(Val.getNode()); - } - - Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val); - DCI.AddToWorklist(Val.getNode()); - Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val); - DCI.AddToWorklist(Val.getNode()); - if (N->getValueType(0) == MVT::f32) { - Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val, - DAG.getIntPtrConstant(0)); - DCI.AddToWorklist(Val.getNode()); - } - return Val; - } else if (N->getOperand(0).getValueType() == MVT::i32) { - // If the intermediate type is i32, we can avoid the load/store here - // too. - } - } - } - break; + case ISD::UINT_TO_FP: + return combineFPToIntToFP(N, DCI); case ISD::STORE: { // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)). 
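// That is, a 32-bit integer store fed directly by an fp-to-int conversion can
// stay in the floating-point registers: the value is converted with fctiwz and
// the low word of the FPR is stored with stfiwx, avoiding a round trip through
// the stack and a GPR.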
- if (TM.getSubtarget().hasSTFIWX() && - !cast(N)->isTruncatingStore() && + if (Subtarget.hasSTFIWX() && !cast(N)->isTruncatingStore() && N->getOperand(1).getOpcode() == ISD::FP_TO_SINT && N->getOperand(1).getValueType() == MVT::i32 && N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) { @@ -8530,8 +8883,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, N->getOperand(1).getNode()->hasOneUse() && (N->getOperand(1).getValueType() == MVT::i32 || N->getOperand(1).getValueType() == MVT::i16 || - (TM.getSubtarget().hasLDBRX() && - TM.getSubtarget().isPPC64() && + (Subtarget.hasLDBRX() && Subtarget.isPPC64() && N->getOperand(1).getValueType() == MVT::i64))) { SDValue BSwapOp = N->getOperand(1).getOperand(0); // Do an any-extend to 32-bits if this is a half-word input. @@ -8552,8 +8904,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N->getOperand(1).getValueType(); if (VT.isSimple()) { MVT StoreVT = VT.getSimpleVT(); - if (TM.getSubtarget().hasVSX() && - TM.getSubtarget().isLittleEndian() && + if (Subtarget.hasVSX() && Subtarget.isLittleEndian() && (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 || StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32)) return expandVSXStoreForLE(N, DCI); @@ -8567,8 +8918,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // For little endian, VSX loads require generating lxvd2x/xxswapd. if (VT.isSimple()) { MVT LoadVT = VT.getSimpleVT(); - if (TM.getSubtarget().hasVSX() && - TM.getSubtarget().isLittleEndian() && + if (Subtarget.hasVSX() && Subtarget.isLittleEndian() && (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 || LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32)) return expandVSXLoadForLE(N, DCI); @@ -8576,12 +8926,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); - if (ISD::isNON_EXTLoad(N) && VT.isVector() && - TM.getSubtarget().hasAltivec() && + if (ISD::isNON_EXTLoad(N) && VT.isVector() && Subtarget.hasAltivec() && // P8 and later hardware should just use LOAD. - !TM.getSubtarget().hasP8Vector() && - (VT == MVT::v16i8 || VT == MVT::v8i16 || - VT == MVT::v4i32 || VT == MVT::v4f32) && + !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v4f32) && LD->getAlignment() < ABIAlignment) { // This is a type-legal unaligned Altivec load. SDValue Chain = LD->getChain(); @@ -8692,40 +9040,41 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } break; - case ISD::INTRINSIC_WO_CHAIN: { - bool isLittleEndian = Subtarget.isLittleEndian(); - Intrinsic::ID Intr = (isLittleEndian ? - Intrinsic::ppc_altivec_lvsr : - Intrinsic::ppc_altivec_lvsl); - if (cast(N->getOperand(0))->getZExtValue() == Intr && - N->getOperand(1)->getOpcode() == ISD::ADD) { - SDValue Add = N->getOperand(1); - - if (DAG.MaskedValueIsZero(Add->getOperand(1), - APInt::getAllOnesValue(4 /* 16 byte alignment */).zext( - Add.getValueType().getScalarType().getSizeInBits()))) { - SDNode *BasePtr = Add->getOperand(0).getNode(); - for (SDNode::use_iterator UI = BasePtr->use_begin(), - UE = BasePtr->use_end(); UI != UE; ++UI) { - if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast(UI->getOperand(0))->getZExtValue() == - Intr) { - // We've found another LVSL/LVSR, and this address is an aligned - // multiple of that one. The results will be the same, so use the - // one we've just found instead. 
- - return SDValue(*UI, 0); + case ISD::INTRINSIC_WO_CHAIN: { + bool isLittleEndian = Subtarget.isLittleEndian(); + Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr + : Intrinsic::ppc_altivec_lvsl); + if (cast(N->getOperand(0))->getZExtValue() == Intr && + N->getOperand(1)->getOpcode() == ISD::ADD) { + SDValue Add = N->getOperand(1); + + if (DAG.MaskedValueIsZero( + Add->getOperand(1), + APInt::getAllOnesValue(4 /* 16 byte alignment */) + .zext( + Add.getValueType().getScalarType().getSizeInBits()))) { + SDNode *BasePtr = Add->getOperand(0).getNode(); + for (SDNode::use_iterator UI = BasePtr->use_begin(), + UE = BasePtr->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + cast(UI->getOperand(0))->getZExtValue() == + Intr) { + // We've found another LVSL/LVSR, and this address is an aligned + // multiple of that one. The results will be the same, so use the + // one we've just found instead. + + return SDValue(*UI, 0); + } } } } } - } break; case ISD::INTRINSIC_W_CHAIN: { // For little endian, VSX loads require generating lxvd2x/xxswapd. - if (TM.getSubtarget().hasVSX() && - TM.getSubtarget().isLittleEndian()) { + if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; @@ -8738,8 +9087,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::INTRINSIC_VOID: { // For little endian, VSX stores require generating xxswapd/stxvd2x. - if (TM.getSubtarget().hasVSX() && - TM.getSubtarget().isLittleEndian()) { + if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; @@ -8755,8 +9103,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && N->getOperand(0).hasOneUse() && (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || - (TM.getSubtarget().hasLDBRX() && - TM.getSubtarget().isPPC64() && + (Subtarget.hasLDBRX() && Subtarget.isPPC64() && N->getValueType(0) == MVT::i64))) { SDValue Load = N->getOperand(0); LoadSDNode *LD = cast(Load); @@ -9033,6 +9380,38 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } } +unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + switch (Subtarget.getDarwinDirective()) { + default: break; + case PPC::DIR_970: + case PPC::DIR_PWR4: + case PPC::DIR_PWR5: + case PPC::DIR_PWR5X: + case PPC::DIR_PWR6: + case PPC::DIR_PWR6X: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: { + if (!ML) + break; + + const PPCInstrInfo *TII = Subtarget.getInstrInfo(); + + // For small loops (between 5 and 8 instructions), align to a 32-byte + // boundary so that the entire loop fits in one instruction-cache line. + uint64_t LoopSize = 0; + for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) + for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) + LoopSize += TII->GetInstSizeInBytes(J); + + if (LoopSize > 16 && LoopSize <= 32) + return 5; + + break; + } + } + + return TargetLowering::getPrefLoopAlignment(ML); +} /// getConstraintType - Given a constraint, return the type of /// constraint it is for this target. @@ -9165,8 +9544,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // the AsmName field from *RegisterInfo.td, then this would not be necessary. 
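// For example, asking for "{r3}" with an i64 operand on a 64-bit subtarget
// yields X3 (in G8RC) rather than R3 (in GPRC), so the 64-bit value is not
// constrained to a 32-bit register class.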
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) { - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); return std::make_pair(TRI->getMatchingSuperReg(R.first, PPC::sub_32, &PPC::G8RCRegClass), &PPC::G8RCRegClass); @@ -9310,14 +9688,12 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setLRStoreRequired(); bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - - DAG.getConstant(PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI), - isPPC64? MVT::i64 : MVT::i32); + DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), + isPPC64 ? MVT::i64 : MVT::i32); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), @@ -9374,7 +9750,7 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName, bool is64Bit = isPPC64 && VT == MVT::i64; unsigned Reg = StringSwitch(RegName) .Case("r1", is64Bit ? PPC::X1 : PPC::R1) - .Case("r2", isDarwinABI ? 0 : (is64Bit ? PPC::X2 : PPC::R2)) + .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : (is64Bit ? PPC::X13 : PPC::R13)) .Default(0); @@ -9527,6 +9903,31 @@ bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { return NumBits1 == 64 && NumBits2 == 32; } +bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + // Generally speaking, zexts are not free, but they are free when they can be + // folded with other operations. + if (LoadSDNode *LD = dyn_cast(Val)) { + EVT MemVT = LD->getMemoryVT(); + if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || + (Subtarget.isPPC64() && MemVT == MVT::i32)) && + (LD->getExtensionType() == ISD::NON_EXTLOAD || + LD->getExtensionType() == ISD::ZEXTLOAD)) + return true; + } + + // FIXME: Add other cases... + // - 32-bit shifts with a zext to i64 + // - zext after ctlz, bswap, etc. + // - zext after and by a constant mask + + return TargetLowering::isZExtFree(Val, VT2); +} + +bool PPCTargetLowering::isFPExtFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return true; +} + bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } @@ -9587,6 +9988,19 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; } +const MCPhysReg * +PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { + // LR is a callee-save register, but we must treat it as clobbered by any call + // site. Hence we include LR in the scratch registers, which are in turn added + // as implicit-defs for stackmaps and patchpoints. The same reasoning applies + // to CTR, which is used by any indirect call. + static const MCPhysReg ScratchRegs[] = { + PPC::X12, PPC::LR8, PPC::CTR8, 0 + }; + + return ScratchRegs; +} + bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles( EVT VT , unsigned DefinedValues) const {