X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FPowerPC%2FPPCISelLowering.cpp;h=9c686cb9467824348faee8be9b5f4d235cbc2a63;hb=46479197843ecb651adc9417c49bbd1b00acfcb6;hp=bdda8eaa46884a72e30c1d0ff5c45b6b67888a06;hpb=1c7d69bbe2de22f386ffd9ef4480f8a77be28130;p=oota-llvm.git
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index bdda8eaa468..9c686cb9467 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12,15 +12,10 @@ //===----------------------------------------------------------------------===// #include "PPCISelLowering.h" +#include "MCTargetDesc/PPCPredicates.h" #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" #include "PPCTargetMachine.h" -#include "MCTargetDesc/PPCPredicates.h" -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Intrinsics.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -29,6 +24,11 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h"
@@ -36,20 +36,20 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; -static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, +static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); +static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); +static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@@ -57,6 +57,9 @@ cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref", cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden); +static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned", +cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); + static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { if (TM.getSubtargetImpl()->isDarwin()) return new TargetLoweringObjectFileMachO();
@@ -67,6 +70,7 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) { const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>(); + PPCRegInfo = TM.getRegisterInfo(); setPow2DivIsCheap();
@@ -132,11 +136,13 @@ 
PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We don't support sin/cos/sqrt/fmod/pow setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); setOperationAction(ISD::FPOW , MVT::f64, Expand); setOperationAction(ISD::FMA , MVT::f64, Legal); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); setOperationAction(ISD::FMA , MVT::f32, Legal); @@ -152,18 +158,45 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + if (Subtarget->hasFPRND()) { + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + + // frin does not implement "ties to even." Thus, this is safe only in + // fast-math mode. + if (TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); + + // These need to set FE_INEXACT, and use a custom inserter. + setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + } + } + // PowerPC does not have BSWAP, CTPOP or CTTZ setOperationAction(ISD::BSWAP, MVT::i32 , Expand); - setOperationAction(ISD::CTPOP, MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i64 , Expand); - setOperationAction(ISD::CTPOP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + if (Subtarget->hasPOPCNTD()) { + setOperationAction(ISD::CTPOP, MVT::i32 , Legal); + setOperationAction(ISD::CTPOP, MVT::i64 , Legal); + } else { + setOperationAction(ISD::CTPOP, MVT::i32 , Expand); + setOperationAction(ISD::CTPOP, MVT::i64 , Expand); + } + // PowerPC does not have ROTR setOperationAction(ISD::ROTR, MVT::i32 , Expand); setOperationAction(ISD::ROTR, MVT::i64 , Expand); @@ -206,6 +239,14 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support + // SjLj exception handling but a light-weight setjmp/longjmp replacement to + // support continuation, user-level threading, and etc.. As a result, no + // other SjLj exception interfaces are implemented and please don't build + // your own exception handling based on them. + // LLVM/Clang supports zero-cost DWARF exception handling. + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // We want to legalize GlobalAddress and ConstantPool nodes into the // appropriate instructions to materialize the address. 
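
Aside: the constructor above is essentially a big legality table. Each setOperationAction(op, type, action) call records whether the SelectionDAG legalizer may keep a node as-is (Legal), must rewrite it in terms of other nodes (Expand), or must call back into the target (Custom); the hasFPRND()/hasPOPCNTD() tests in this patch gate those choices on subtarget features. The stand-alone C++ sketch below models that mechanism only; MiniLowering, Op, and Ty are illustrative names invented for this note, not LLVM types.

// Self-contained sketch (not LLVM code): the idea behind setOperationAction().
// A target keeps a table mapping (operation, type) -> action; the legalizer
// consults it and either keeps the node (Legal), rewrites it in terms of
// other nodes (Expand), or calls a target hook (Custom).
#include <iostream>
#include <map>
#include <utility>

enum class Action { Legal, Expand, Custom };
enum class Op { CTPOP, FSINCOS, FFLOOR };
enum class Ty { i32, i64, f32, f64 };

class MiniLowering {
  std::map<std::pair<Op, Ty>, Action> Table;

public:
  void setOperationAction(Op O, Ty T, Action A) { Table[{O, T}] = A; }
  Action getOperationAction(Op O, Ty T) const {
    auto It = Table.find({O, T});
    // Like LLVM, anything not explicitly overridden defaults to Legal.
    return It == Table.end() ? Action::Legal : It->second;
  }
};

int main() {
  bool HasPOPCNTD = false; // stand-in for the subtarget feature test above
  MiniLowering TLI;
  // Mirrors the patch: CTPOP is Legal only when the CPU has popcntd;
  // otherwise the legalizer must expand it into shifts and masks.
  TLI.setOperationAction(Op::CTPOP, Ty::i64,
                         HasPOPCNTD ? Action::Legal : Action::Expand);
  TLI.setOperationAction(Op::FSINCOS, Ty::f64, Action::Expand);
  std::cout << (TLI.getOperationAction(Op::CTPOP, Ty::i64) == Action::Expand)
            << '\n'; // prints 1: CTPOP must be expanded on this subtarget
}

In the real API the default action is likewise Legal until a target overrides it, which is why the constructor only lists the exceptions.
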
@@ -285,15 +326,28 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We cannot do this with Promote because i64 is not a legal type. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - // FIXME: disable this lowered code. This generates 64-bit register values, - // and we don't model the fact that the top part is clobbered by calls. We - // need to flag these together so that the value isn't live across a call. - //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + if (PPCSubTarget.hasLFIWAX() || Subtarget->isPPC64()) + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); } + // With the instructions enabled under FPCVT, we can do everything. + if (PPCSubTarget.hasFPCVT()) { + if (Subtarget->has64BitSupport()) { + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + } + + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + } + if (Subtarget->use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly addRegisterClass(MVT::i64, &PPC::G8RCRegClass); @@ -347,6 +401,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Expand); @@ -361,6 +430,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + + for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) { + MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j; + setTruncStoreAction(VT, InnerVT, Expand); + } + setLoadExtAction(ISD::SEXTLOAD, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, Expand); } // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle @@ -377,6 +457,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); @@ -396,6 +480,14 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + + // Altivec does not contain unordered floating-point compare instructions + setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand); } if (Subtarget->has64BitSupport()) { @@ -405,6 +497,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? @@ -458,15 +552,14 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // friends. Gcc uses same threshold of 128 bytes (= 32 word stores). if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc || Subtarget->getDarwinDirective() == PPC::DIR_E5500) { - maxStoresPerMemset = 32; - maxStoresPerMemsetOptSize = 16; - maxStoresPerMemcpy = 32; - maxStoresPerMemcpyOptSize = 8; - maxStoresPerMemmove = 32; - maxStoresPerMemmoveOptSize = 8; + MaxStoresPerMemset = 32; + MaxStoresPerMemsetOptSize = 16; + MaxStoresPerMemcpy = 32; + MaxStoresPerMemcpyOptSize = 8; + MaxStoresPerMemmove = 32; + MaxStoresPerMemmoveOptSize = 8; setPrefFunctionAlignment(4); - benefitFromCodePlacementOpt = true; } } @@ -512,16 +605,13 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; case PPCISD::SHL: return "PPCISD::SHL"; - case PPCISD::EXTSW_32: return "PPCISD::EXTSW_32"; - case PPCISD::STD_32: return "PPCISD::STD_32"; - case PPCISD::CALL_SVR4: return "PPCISD::CALL_SVR4"; - case PPCISD::CALL_NOP_SVR4: return "PPCISD::CALL_NOP_SVR4"; - case PPCISD::CALL_Darwin: return "PPCISD::CALL_Darwin"; - case PPCISD::NOP: return "PPCISD::NOP"; + case PPCISD::CALL: return "PPCISD::CALL"; + case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; case PPCISD::MTCTR: return "PPCISD::MTCTR"; - case PPCISD::BCTRL_Darwin: return "PPCISD::BCTRL_Darwin"; - case PPCISD::BCTRL_SVR4: return "PPCISD::BCTRL_SVR4"; + case PPCISD::BCTRL: return "PPCISD::BCTRL"; case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; + case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; + case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; case PPCISD::MFCR: return "PPCISD::MFCR"; case PPCISD::VCMP: return "PPCISD::VCMP"; case PPCISD::VCMPo: return "PPCISD::VCMPo"; @@ -531,13 +621,25 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::STCX: return "PPCISD::STCX"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; case 
PPCISD::MFFS: return "PPCISD::MFFS"; - case PPCISD::MTFSB0: return "PPCISD::MTFSB0"; - case PPCISD::MTFSB1: return "PPCISD::MTFSB1"; case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; - case PPCISD::MTFSF: return "PPCISD::MTFSF"; case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; case PPCISD::CR6SET: return "PPCISD::CR6SET"; case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; + case PPCISD::ADDIS_TOC_HA: return "PPCISD::ADDIS_TOC_HA"; + case PPCISD::LD_TOC_L: return "PPCISD::LD_TOC_L"; + case PPCISD::ADDI_TOC_L: return "PPCISD::ADDI_TOC_L"; + case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; + case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; + case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; + case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; + case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; + case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; + case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; + case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; + case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; + case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; + case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; + case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; } }
@@ -971,7 +1073,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm)) { Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; }
@@ -1020,7 +1122,7 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, } // Otherwise, do it the hard way, using R0 as the base register. - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, N.getValueType()); Index = N; return true;
@@ -1083,7 +1185,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm)) { Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy()); - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; }
@@ -1121,15 +1223,19 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SelectionDAG &DAG) const { if (DisablePPCPreinc) return false; + bool isLoad = true; SDValue Ptr; EVT VT; + unsigned Alignment; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - + Alignment = LD->getAlignment(); } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); + Alignment = ST->getAlignment(); + isLoad = false; } else return false;
@@ -1137,7 +1243,25 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (VT.isVector()) return false; - if (SelectAddressRegReg(Ptr, Offset, Base, DAG)) { + if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { + + // Common code will reject creating a pre-inc form if the base pointer + // is a frame index, or if N is a store and the base pointer is either + // the same as or a predecessor of the value being stored. Check for + // those situations here, and try with swapped Base/Offset instead. 
+ bool Swap = false; + + if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base)) + Swap = true; + else if (!isLoad) { + SDValue Val = cast<StoreSDNode>(N)->getValue(); + if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) + Swap = true; + } + + if (Swap) + std::swap(Base, Offset); + AM = ISD::PRE_INC; return true; }
@@ -1148,6 +1272,10 @@ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG)) return false; } else { + // LDU/STU need an address with at least 4-byte alignment. + if (Alignment < 4) + return false; + // reg + imm * 4. if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG)) return false;
@@ -1284,19 +1412,81 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, EVT PtrVT = getPointerTy(); bool is64bit = PPCSubTarget.isPPC64(); - TLSModel::Model model = getTargetMachine().getTLSModel(GV); + TLSModel::Model Model = getTargetMachine().getTLSModel(GV); + + if (Model == TLSModel::LocalExec) { + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL16_HA); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL16_LO); + SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, + is64bit ? MVT::i64 : MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); + return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); + } + + if (!is64bit) + llvm_unreachable("only local-exec is currently supported for ppc32"); + + if (Model == TLSModel::InitialExec) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + SDValue TPOffsetHi = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, + PtrVT, GOTReg, TGA); + SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, + PtrVT, TGA, TPOffsetHi); + return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGA); + } + + if (Model == TLSModel::GeneralDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, + GOTReg, TGA); + SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT, + GOTEntryHi, TGA); + + // We need a chain node, and don't have one handy. The underlying + // call has no side effects, so using the function entry node + // suffices. + SDValue Chain = DAG.getEntryNode(); + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); + SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl, + PtrVT, ParmReg, TGA); + // The return value from GET_TLS_ADDR really is in X3 already, but + // some hacks are needed here to tie everything together. The extra + // copies dissolve during subsequent transforms. + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); + return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT); + } - SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TPREL16_HA); - SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TPREL16_LO); + if (Model == TLSModel::LocalDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, + GOTReg, TGA); + SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT, + GOTEntryHi, TGA); + + // We need a chain node, and don't have one handy. The underlying + // call has no side effects, so using the function entry node + // suffices. 
+ SDValue Chain = DAG.getEntryNode(); + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); + SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl, + PtrVT, ParmReg, TGA); + // The return value from GET_TLSLD_ADDR really is in X3 already, but + // some hacks are needed here to tie everything together. The extra + // copies dissolve during subsequent transforms. + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); + SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, + Chain, ParmReg, TGA); + return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); + } - if (model != TLSModel::LocalExec) - llvm_unreachable("only local-exec TLS mode supported"); - SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); - SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); - return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); + llvm_unreachable("Unknown TLS model!"); } SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, @@ -1630,18 +1820,18 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, #include "PPCGenCallingConv.inc" -static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { return true; } -static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { static const uint16_t ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, @@ -1664,11 +1854,11 @@ static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, return false; } -static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { static const uint16_t ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 @@ -1791,7 +1981,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); - CCInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4); + CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1852,7 +2042,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); - CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4_ByVal); + CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); // Area that is at least reserved in the caller of this function. unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); @@ -1953,6 +2143,48 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( return Chain; } +// PPC64 passes i8, i16, and i32 values in i64 registers. 
Promote +// value to MVT::i64 and then truncate to the correct register size. +SDValue +PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, + SelectionDAG &DAG, SDValue ArgVal, + DebugLoc dl) const { + if (Flags.isSExt()) + ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, + DAG.getValueType(ObjectVT)); + else if (Flags.isZExt()) + ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, + DAG.getValueType(ObjectVT)); + + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); +} + +// Set the size that is at least reserved in caller of this function. Tail +// call optimized functions' reserved stack space needs to be aligned so that +// taking the difference between two stack areas will result in an aligned +// stack. +void +PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG, + unsigned nAltivecParamsAtEnd, + unsigned MinReservedArea, + bool isPPC64) const { + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + // Add the Altivec parameters at the end, if needed. + if (nAltivecParamsAtEnd) { + MinReservedArea = ((MinReservedArea+15)/16)*16; + MinReservedArea += 16*nAltivecParamsAtEnd; + } + MinReservedArea = + std::max(MinReservedArea, + PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); + unsigned TargetAlign + = DAG.getMachineFunction().getTarget().getFrameLowering()-> + getStackAlignment(); + unsigned AlignMask = TargetAlign-1; + MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; + FI->setMinReservedArea(MinReservedArea); +} + SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Chain,
@@ -2002,13 +2234,16 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SmallVector<SDValue, 8> MemOps; unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); - for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) { + unsigned CurArgIdx = 0; + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[ArgNo].OrigArgIndex; unsigned CurArgOffset = ArgOffset;
@@ -2034,33 +2269,62 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; - // All aggregates smaller than 8 bytes must be passed right-justified. - if (ObjSize==1 || ObjSize==2) { - CurArgOffset = CurArgOffset + (4 - ObjSize); + // Empty aggregate parameters do not take up registers. Examples: + // struct { } a; + // union { } b; + // int c[0]; + // etc. However, we have to provide a place-holder in InVals, so + // pretend we have an 8-byte item at the current address for that + // purpose. + if (!ObjSize) { + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(FIN); + continue; } + // All aggregates smaller than 8 bytes must be passed right-justified. + if (ObjSize < PtrByteSize) + CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize); // The value of the object is its address. 
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); - if (ObjSize==1 || ObjSize==2 || ObjSize==4) { + + if (ObjSize < 8) { if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - EVT ObjType = (ObjSize == 1 ? MVT::i8 : - (ObjSize == 2 ? MVT::i16 : MVT::i32)); - SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, - CurArgOffset), - ObjType, false, false, 0); + SDValue Store; + + if (ObjSize==1 || ObjSize==2 || ObjSize==4) { + EVT ObjType = (ObjSize == 1 ? MVT::i8 : + (ObjSize == 2 ? MVT::i16 : MVT::i32)); + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(FuncArg, CurArgOffset), + ObjType, false, false, 0); + } else { + // For sizes that don't fit a truncating store (3, 5, 6, 7), + // store the whole register as-is to the parameter save area + // slot. The address of the parameter was already calculated + // above (InVals.push_back(FIN)) to be the right-justified + // offset within the slot. For this store, we need a new + // frame index that points at the beginning of the slot. + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(FuncArg, ArgOffset), + false, false, 0); + } + MemOps.push_back(Store); ++GPR_idx; } - + // Whether we copied from a register or not, advance the offset + // into the parameter save area by a full doubleword. ArgOffset += PtrByteSize; - continue; } + for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { // Store whatever pieces of the object are in registers // to memory. ArgOffset will be the address of the beginning @@ -2071,23 +2335,14 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Shifted = Val; - - // For 64-bit SVR4, small structs come in right-adjusted. - // Shift them left so the following logic works as expected. - if (ObjSize < 8) { - SDValue ShiftAmt = DAG.getConstant(64 - 8 * ObjSize, PtrVT); - Shifted = DAG.getNode(ISD::SHL, dl, PtrVT, Val, ShiftAmt); - } - - SDValue Store = DAG.getStore(Val.getValue(1), dl, Shifted, FIN, + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(FuncArg, ArgOffset), false, false, 0); MemOps.push_back(Store); ++GPR_idx; ArgOffset += PtrByteSize; } else { - ArgOffset += ArgSize - (ArgOffset-CurArgOffset); + ArgOffset += ArgSize - j; break; } } @@ -2102,18 +2357,10 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); - if (ObjectVT == MVT::i32) { + if (ObjectVT == MVT::i32) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. 
- if (Flags.isSExt()) - ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, - DAG.getValueType(ObjectVT)); - else if (Flags.isZExt()) - ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, - DAG.getValueType(ObjectVT)); - - ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); - } + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); ++GPR_idx; } else {
@@ -2142,6 +2389,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( ++FPR_idx; } else { needsLoad = true; + ArgSize = PtrByteSize; } ArgOffset += 8;
@@ -2190,24 +2438,10 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( } // Set the size that is at least reserved in caller of this function. Tail - // call optimized function's reserved stack space needs to be aligned so that + // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); - // Add the Altivec parameters at the end, if needed. - if (nAltivecParamsAtEnd) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += 16*nAltivecParamsAtEnd; - } - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(true, true)); - unsigned TargetAlign - = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - FI->setMinReservedArea(MinReservedArea); + setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start.
@@ -2215,8 +2449,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( int Depth = ArgOffset; FuncInfo->setVarArgsFrameIndex( - MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, - Depth, true)); + MFI->CreateFixedObject(PtrByteSize, Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs
@@ -2229,7 +2462,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( MachinePointerInfo(), false, false, 0); MemOps.push_back(Store); // Increment the address by four for the next argument to store - SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT); + SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } }
@@ -2345,6 +2578,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin( SmallVector<SDValue, 8> MemOps; unsigned nAltivecParamsAtEnd = 0; + // FIXME: FuncArg and Ins[ArgNo] must reference the same argument. + // When passing anonymous aggregates, this is currently not true. + // See LowerFormalArguments_64SVR4 for a fix. Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) { SDValue ArgVal;
@@ -2394,8 +2630,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - EVT ObjType = (ObjSize == 1 ? MVT::i8 : - (ObjSize == 2 ? MVT::i16 : MVT::i32)); + EVT ObjType = ObjSize == 1 ? 
MVT::i8 : MVT::i16; SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(FuncArg, CurArgOffset),
@@ -2457,18 +2692,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin( unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); - if (ObjectVT == MVT::i32) { + if (ObjectVT == MVT::i32) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. - if (Flags.isSExt()) - ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, - DAG.getValueType(ObjectVT)); - else if (Flags.isZExt()) - ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, - DAG.getValueType(ObjectVT)); - - ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); - } + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); ++GPR_idx; } else {
@@ -2555,23 +2782,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin( } // Set the size that is at least reserved in caller of this function. Tail - // call optimized function's reserved stack space needs to be aligned so that + // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); - // Add the Altivec parameters at the end, if needed. - if (nAltivecParamsAtEnd) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += 16*nAltivecParamsAtEnd; - } - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); - unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - FI->setMinReservedArea(MinReservedArea); + setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start.
@@ -2953,7 +3167,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, NodeTys.push_back(MVT::Other); // Returns a chain NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. - unsigned CallOpc = isSVR4ABI ? PPCISD::CALL_SVR4 : PPCISD::CALL_Darwin; + unsigned CallOpc = PPCISD::CALL; bool needIndirectCall = true; if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
@@ -3021,7 +3235,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // Thus for a call through a function pointer, the following actions need // to be performed: // 1. Save the TOC of the caller in the TOC save area of its stack - // frame (this is done in LowerCall_Darwin_Or_64SVR4()). + // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). // 2. Load the address of the function entry point from the function // descriptor. // 3. Load the TOC of the callee from the function descriptor into r2.
@@ -3086,8 +3300,11 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, NodeTys.push_back(MVT::Other); NodeTys.push_back(MVT::Glue); Ops.push_back(Chain); - CallOpc = isSVR4ABI ? PPCISD::BCTRL_SVR4 : PPCISD::BCTRL_Darwin; + CallOpc = PPCISD::BCTRL; Callee.setNode(0); + // Add use of X11 (holding environment pointer) + if (isSVR4ABI && isPPC64) + Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. 
if (isTailCall) Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
@@ -3135,12 +3352,32 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; - EVT VT = VA.getValVT(); assert(VA.isRegLoc() && "Can only return in registers!"); - Chain = DAG.getCopyFromReg(Chain, dl, - VA.getLocReg(), VT, InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - InFlag = Chain.getValue(2); + + SDValue Val = DAG.getCopyFromReg(Chain, dl, + VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); } return Chain;
@@ -3169,7 +3406,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // When performing tail call optimization the callee pops its arguments off // the stack. Account for this here so these bytes can be pushed back on in - // PPCRegisterInfo::eliminateCallFramePseudoInstr. + // PPCFrameLowering::eliminateCallFramePseudoInstr. int BytesCalleePops = (CallConv == CallingConv::Fast && getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
@@ -3185,17 +3422,6 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // Emit tail call. if (isTailCall) { - // If this is the first return lowered for this function, add the regs - // to the liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_PPC); - for (unsigned i = 0; i != RVLocs.size(); ++i) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - assert(((Callee.getOpcode() == ISD::Register && cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) || Callee.getOpcode() == ISD::TargetExternalSymbol ||
@@ -3217,7 +3443,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, bool needsTOCRestore = false; if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) { - if (CallOpc == PPCISD::BCTRL_SVR4) { + if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. // See PrepareCall() for more information about calls through function
@@ -3228,9 +3454,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // from allocating it), resulting in an additional register being // allocated and an unnecessary move instruction being generated. needsTOCRestore = true; - } else if ((CallOpc == PPCISD::CALL_SVR4) && !isLocalCall(Callee)) { + } else if ((CallOpc == PPCISD::CALL) && !isLocalCall(Callee)) { // Otherwise insert NOP for non-local calls. 
- CallOpc = PPCISD::CALL_NOP_SVR4; + CallOpc = PPCISD::CALL_NOP; } } @@ -3271,14 +3497,20 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); - if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) - return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, Outs, OutVals, Ins, - dl, DAG, InVals); + if (PPCSubTarget.isSVR4ABI()) { + if (PPCSubTarget.isPPC64()) + return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, Outs, OutVals, Ins, + dl, DAG, InVals); + else + return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, Outs, OutVals, Ins, + dl, DAG, InVals); + } - return LowerCall_Darwin_Or_64SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, Outs, OutVals, Ins, - dl, DAG, InVals); + return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, + isTailCall, Outs, OutVals, Ins, + dl, DAG, InVals); } SDValue @@ -3333,11 +3565,11 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, bool Result; if (Outs[i].IsFixed) { - Result = CC_PPC_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, - CCInfo); + Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, + CCInfo); } else { - Result = CC_PPC_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo); + Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, + ArgFlags, CCInfo); } if (Result) { @@ -3350,7 +3582,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, } } else { // All arguments are treated the same. - CCInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4); + CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); } // Assign locations to all of the outgoing aggregate by value arguments. @@ -3361,7 +3593,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); - CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4_ByVal); + CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); // Size of the linkage area, parameter list area and the part of the local // space variable where copies of aggregates which are passed by value are @@ -3494,8 +3726,27 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, Ins, InVals); } +// Copy an argument into memory, being careful to do this outside the +// call sequence for the call to which the argument belongs. +SDValue +PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff, + SDValue CallSeqStart, + ISD::ArgFlagsTy Flags, + SelectionDAG &DAG, + DebugLoc dl) const { + SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, + CallSeqStart.getNode()->getOperand(0), + Flags, DAG, dl); + // The MEMCPY must go outside the CALLSEQ_START..END. 
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, + CallSeqStart.getNode()->getOperand(1)); + DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), + NewCallSeqStart.getNode()); + return NewCallSeqStart; +} + SDValue -PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, +PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -3504,13 +3755,10 @@ DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - bool isSVR4ABI = PPCSubTarget.isSVR4ABI(); - - unsigned NumOps = Outs.size(); + unsigned NumOps = Outs.size(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - bool isPPC64 = PtrVT == MVT::i64; - unsigned PtrByteSize = isPPC64 ? 8 : 4; + unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction();
@@ -3526,12 +3774,13 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, unsigned nAltivecParamsAtEnd = 0; // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with 24/48 bytes, which is - // prereserved space for [SP][CR][LR][3 x unused]. + // area, and parameter passing area. We start with at least 48 bytes, which + // is reserved space for [SP][CR][LR][3 x unused]. + // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result + // of this call. unsigned NumBytes = - CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv, - Outs, OutVals, - nAltivecParamsAtEnd); + CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv, + Outs, OutVals, nAltivecParamsAtEnd); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization.
@@ -3556,24 +3805,16 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. - SDValue StackPtr; - if (isPPC64) - StackPtr = DAG.getRegister(PPC::X1, MVT::i64); - else - StackPtr = DAG.getRegister(PPC::R1, MVT::i32); + SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); // Figure out which arguments are going to go in registers, and which in // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true); unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - static const uint16_t GPR_32[] = { // 32-bit registers. - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10, - }; - static const uint16_t GPR_64[] = { // 64-bit registers. + static const uint16_t GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, };
@@ -3583,12 +3824,10 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; - const unsigned NumGPRs = array_lengthof(GPR_32); + const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); - const uint16_t *GPR = isPPC64 ? 
GPR_64 : GPR_32; - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; SmallVector<TailCallArgumentInfo, 8> TailCallArguments; SmallVector<SDValue, 8> MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); - // On PPC64, promote integers to 64-bit values. - if (isPPC64 && Arg.getValueType() == MVT::i32) { + // Promote integers to 64-bit values. + if (Arg.getValueType() == MVT::i32) { // FIXME: Should this use ANY_EXTEND if neither sext nor zext? unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); }
@@ -3620,14 +3859,16 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, // struct x { short a; char b; } // will have Size = 4. With #pragma pack(1), it will have Size = 3. // These are the proper values we need for right-justifying the - // aggregate in a parameter register for 64-bit SVR4. + // aggregate in a parameter register. unsigned Size = Flags.getByValSize(); - // FOR DARWIN ONLY: Very small objects are passed right-justified. - // Everything else is passed left-justified. - // FOR 64-BIT SVR4: All aggregates smaller than 8 bytes must - // be passed right-justified. - if (Size==1 || Size==2 || - (Size==4 && isSVR4ABI)) { + + // An empty aggregate parameter takes up no storage and no + // registers. + if (Size == 0) + continue; + + // All aggregates smaller than 8 bytes must be passed right-justified. + if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
@@ -3637,20 +3878,18 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; - } else { - SDValue Const = DAG.getConstant(4 - Size, PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); - SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr, - CallSeqStart.getNode()->getOperand(0), - Flags, DAG, dl); - // This must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1)); - DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), - NewCallSeqStart.getNode()); - Chain = CallSeqStart = NewCallSeqStart; - ArgOffset += PtrByteSize; + continue; } + } + + if (GPR_idx == NumGPRs && Size < 8) { + SDValue Const = DAG.getConstant(PtrByteSize - Size, + PtrOff.getValueType()); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + ArgOffset += PtrByteSize; continue; } // Copy entire object into memory. There are cases where gcc-generated // code assumes it is there, even if it could be put entirely into // registers. (This is not what the doc says.) // FIXME: The above statement is likely due to a misunderstanding of the - // documents. At least for 64-bit SVR4, all arguments must be copied - // into the parameter area BY THE CALLEE in the event that the callee - // takes the address of any formal argument. That has not yet been - // implemented. However, it is reasonable to use the stack area as a - // staging area for the register load. - - // Skip this for small aggregates under 64-bit SVR4, as we will use - // the same slot for a right-justified copy, below. 
- if (Size >= 8 || !isSVR4ABI) { - SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, - CallSeqStart.getNode()->getOperand(0), - Flags, DAG, dl); - // This must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1)); - DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), - NewCallSeqStart.getNode()); - Chain = CallSeqStart = NewCallSeqStart; - } - - // FOR 64-BIT SVR4: When a register is available, pass the - // aggregate right-justified. - if (isSVR4ABI && Size < 8 && GPR_idx != NumGPRs) { + // documents. All arguments must be copied into the parameter area BY + // THE CALLEE in the event that the callee takes the address of any + // formal argument. That has not yet been implemented. However, it is + // reasonable to use the stack area as a staging area for the register + // load. + + // Skip this for small aggregates, as we will use the same slot for a + // right-justified copy, below. + if (Size >= 8) + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, + CallSeqStart, + Flags, DAG, dl); + + // When a register is available, pass a small aggregate right-justified. + if (Size < 8 && GPR_idx != NumGPRs) { // The easiest way to get this right-justified in a register // is to copy the structure into the rightmost portion of a // local variable slot, then load the whole slot into the @@ -3691,16 +3922,9 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, // parameter save area instead of a new local variable. SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); - SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr, - CallSeqStart.getNode()->getOperand(0), - Flags, DAG, dl); - - // Place the memcpy outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1)); - DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), - NewCallSeqStart.getNode()); - Chain = CallSeqStart = NewCallSeqStart; + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); // Load the slot into the register. SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff, @@ -3714,9 +3938,8 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, continue; } - // For small aggregates (Darwin only) and aggregates >= PtrByteSize, - // copy the pieces of the object that fit into registers from the - // parameter save area. + // For aggregates larger than PtrByteSize, copy the pieces of the + // object that fit into registers from the parameter save area. for (unsigned j=0; j NumVRs) { - unsigned j = 0; - // Offset is aligned; skip 1st 12 params which go in V registers. - ArgOffset = ((ArgOffset+15)/16)*16; - ArgOffset += 12*16; - for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = OutVals[i]; - EVT ArgType = Outs[i].VT; - if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || - ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { - if (++j > NumVRs) { - SDValue PtrOff; - // We are emitting Altivec params in order. - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, isTailCall, true, MemOpChains, - TailCallArguments, dl); - ArgOffset += 16; - } - } - } - } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, @@ -3887,7 +4081,7 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, // Check if this is an indirect call (MTCTR/BCTRL). 
// See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. - if (!isTailCall && isPPC64 && PPCSubTarget.isSVR4ABI() && + if (!isTailCall && !dyn_cast<GlobalAddressSDNode>(Callee) && !dyn_cast<ExternalSymbolSDNode>(Callee) && !isBLACompatibleAddress(Callee, DAG)) {
@@ -3898,18 +4092,12 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(), false, false, 0); + // R12 must contain the address of an indirect callee. This does not + // mean the MTCTR instruction must use R12; it's easier to model this + // as an extra parameter, so do that. + RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } - // On Darwin, R12 must contain the address of an indirect callee. This does - // not mean the MTCTR instruction must use R12; it's easier to model this as - // an extra parameter, so do that. - if (!isTailCall && - !dyn_cast<GlobalAddressSDNode>(Callee) && - !dyn_cast<ExternalSymbolSDNode>(Callee) && - !isBLACompatibleAddress(Callee, DAG)) - RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : - PPC::R12), Callee)); - // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag;
@@ -3920,7 +4108,355 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee, } if (isTailCall) - PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp, + PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, + FPOp, true, TailCallArguments); + + return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG, + RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes, + Ins, InVals); +} + +SDValue +PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + unsigned NumOps = Outs.size(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + bool isPPC64 = PtrVT == MVT::i64; + unsigned PtrByteSize = isPPC64 ? 8 : 4; + + MachineFunction &MF = DAG.getMachineFunction(); + + // Mark this function as potentially containing a function that contains a + // tail call. As a consequence the frame pointer will be used for dynamicalloc + // and restoring the callers stack pointer in this functions epilog. This is + // done because by tail calling the called function might overwrite the value + // in this function's (MF) stack pointer stack slot 0(SP). + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) + MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); + + unsigned nAltivecParamsAtEnd = 0; + + // Count how many bytes are to be pushed on the stack, including the linkage + // area, and parameter passing area. We start with 24/48 bytes, which is + // prereserved space for [SP][CR][LR][3 x unused]. + unsigned NumBytes = + CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv, + Outs, OutVals, + nAltivecParamsAtEnd); + + // Calculate by how many bytes the stack has to be adjusted in case of tail + // call optimization. + int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); + + // To protect arguments on the stack from being clobbered in a tail call, + // force all the loads to happen before doing any other lowering. 
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so they can be moved somewhere
+  // else later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr;
+  if (isPPC64)
+    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+  else
+    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const uint16_t GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const uint16_t GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR_32);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // On PPC64, promote integers to 64-bit values.
+    if (isPPC64 && Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME: memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      unsigned Size = Flags.getByValSize();
+      // Very small objects are passed right-justified.  Everything else is
+      // passed left-justified.
+      if (Size==1 || Size==2) {
+        EVT VT = (Size==1) ?
MVT::i8 : MVT::i16; + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, + MachinePointerInfo(), VT, + false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + + ArgOffset += PtrByteSize; + } else { + SDValue Const = DAG.getConstant(PtrByteSize - Size, + PtrOff.getValueType()); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + ArgOffset += PtrByteSize; + } + continue; + } + // Copy entire object into memory. There are cases where gcc-generated + // code assumes it is there, even if it could be put entirely into + // registers. (This is not what the doc says.) + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, + CallSeqStart, + Flags, DAG, dl); + + // For small aggregates (Darwin only) and aggregates >= PtrByteSize, + // copy the pieces of the object that fit into registers from the + // parameter save area. + for (unsigned j=0; j NumVRs) { + unsigned j = 0; + // Offset is aligned; skip 1st 12 params which go in V registers. + ArgOffset = ((ArgOffset+15)/16)*16; + ArgOffset += 12*16; + for (unsigned i = 0; i != NumOps; ++i) { + SDValue Arg = OutVals[i]; + EVT ArgType = Outs[i].VT; + if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || + ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { + if (++j > NumVRs) { + SDValue PtrOff; + // We are emitting Altivec params in order. + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, true, MemOpChains, + TailCallArguments, dl); + ArgOffset += 16; + } + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // On Darwin, R12 must contain the address of an indirect callee. This does + // not mean the MTCTR instruction must use R12; it's easier to model this as + // an extra parameter, so do that. + if (!isTailCall && + !dyn_cast(Callee) && + !dyn_cast(Callee) && + !isBLACompatibleAddress(Callee, DAG)) + RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : + PPC::R12), Callee)); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + if (isTailCall) + PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG, @@ -3951,28 +4487,43 @@ PPCTargetLowering::LowerReturn(SDValue Chain, getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); - // If this is the first return lowered for this function, add the regs to the - // liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector RetOps(1, Chain); // Copy the result values into the output registers. 
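+  // (RetOps, just built above, collects the operands for the final
+  // PPCISD::RET_FLAG node: the chain, one register reference per return
+  // value and, if present, the trailing glue flag.)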
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                             OutVals[i], Flag);
+
+    SDValue Arg = OutVals[i];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }
 
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
  if (Flag.getNode())
-    return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
-  else
-    return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain);
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other,
+                     &RetOps[0], RetOps.size());
 }
 
 SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
@@ -4078,6 +4629,21 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops, 3);
 }
 
+SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
 /// LowerSELECT_CC - Lower floating-point select_cc's into an fsel instruction
 /// when possible.
 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -4165,76 +4731,182 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
  case MVT::i32:
    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
-                      PPCISD::FCTIDZ,
+                      (PPCSubTarget.hasFPCVT() ? PPCISD::FCTIWUZ :
+                                                 PPCISD::FCTIDZ),
                      dl, MVT::f64, Src);
    break;
  case MVT::i64:
-    Tmp = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Src);
+    assert((Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()) &&
+           "i64 FP_TO_UINT is supported only with FPCVT");
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ,
+                      dl, MVT::f64, Src);
    break;
  }
 
  // Convert the FP value to an int value through memory.
-  SDValue FIPtr = DAG.CreateStackTemporary(MVT::f64);
+  bool i32Stack = Op.getValueType() == MVT::i32 && PPCSubTarget.hasSTFIWX() &&
+    (Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT());
+  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
+  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI);
 
  // Emit a store to the stack slot.
-  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr,
-                               MachinePointerInfo(), false, false, 0);
+  SDValue Chain;
+  if (i32Stack) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
+    SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
+    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+              DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops),
+              MVT::i32, MMO);
+  } else
+    Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr,
+                         MPI, false, false, 0);
 
  // Result is a load from the stack slot.  If loading 4 bytes, make sure to
  // add in a bias.
-  if (Op.getValueType() == MVT::i32)
+  if (Op.getValueType() == MVT::i32 && !i32Stack) {
    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
                        DAG.getConstant(4, FIPtr.getValueType()));
-  return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MachinePointerInfo(),
+    MPI = MachinePointerInfo();
+  }
+
+  return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI,
                     false, false, false, 0);
 }
 
-SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op,
+SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
    return SDValue();
 
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || PPCSubTarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDUS : PPCISD::FCFIDS) :
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDU : PPCISD::FCFID);
+  MVT FCFTy = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+              MVT::f32 : MVT::f64;
+
  if (Op.getOperand(0).getValueType() == MVT::i64) {
-    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op.getOperand(0));
-    SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits);
-    if (Op.getValueType() == MVT::f32)
+    SDValue SINT = Op.getOperand(0);
+    // When converting to single-precision, we actually need to convert
+    // to double-precision first and then round to single-precision.
+    // To avoid double-rounding effects during that operation, we have
+    // to prepare the input operand.  Bits that might be truncated when
+    // converting to double-precision are replaced by a bit that won't
+    // be lost at this stage, but is below the single-precision rounding
+    // position.
+    //
+    // However, if -enable-unsafe-fp-math is in effect, accept double
+    // rounding to avoid the extra overhead.
+    if (Op.getValueType() == MVT::f32 &&
+        !PPCSubTarget.hasFPCVT() &&
+        !DAG.getTarget().Options.UnsafeFPMath) {
+
+      // Twiddle input to make sure the low 11 bits are zero.  (If this
+      // is the case, we are guaranteed the value will fit into the 53-bit
+      // mantissa of an IEEE double-precision value without rounding.)
+      // If any of those low 11 bits were not zero originally, make sure
+      // bit 12 (value 2048) is set instead, so that the final rounding
+      // to single-precision gets the correct result.
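+      // For example, with a hypothetical input of 0x2001 (a low bit set):
+      //   0x2001 & 2047  = 0x001
+      //   0x001 + 2047   = 0x800    (the carry sets the 2048 bit)
+      //   0x800 | 0x2001 = 0x2801
+      //   0x2801 & -2048 = 0x2800   (low 11 bits cleared, 2048 bit set)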
+ SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, + SINT, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::ADD, dl, MVT::i64, + Round, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); + Round = DAG.getNode(ISD::AND, dl, MVT::i64, + Round, DAG.getConstant(-2048, MVT::i64)); + + // However, we cannot use that value unconditionally: if the magnitude + // of the input value is small, the bit-twiddling we did above might + // end up visibly changing the output. Fortunately, in that case, we + // don't need to twiddle bits since the original input will convert + // exactly to double-precision floating-point already. Therefore, + // construct a conditional to use the original value if the top 11 + // bits are all sign-bit copies, and use the rounded value computed + // above otherwise. + SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, + SINT, DAG.getConstant(53, MVT::i32)); + Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, + Cond, DAG.getConstant(1, MVT::i64)); + Cond = DAG.getSetCC(dl, MVT::i32, + Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT); + + SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); + } + + SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); + + if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; } assert(Op.getOperand(0).getValueType() == MVT::i32 && - "Unhandled SINT_TO_FP type in custom expander!"); + "Unhandled INT_TO_FP type in custom expander!"); // Since we only generate this in 64-bit mode, we can take advantage of // 64-bit registers. In particular, sign extend the input value into the // 64-bit register with extsw, store the WHOLE 64-bit value into the stack // then lfd it and fcfid it. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Ext64 = DAG.getNode(PPCISD::EXTSW_32, dl, MVT::i32, + SDValue Ld; + if (PPCSubTarget.hasLFIWAX() || PPCSubTarget.hasFPCVT()) { + int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, 0); + + assert(cast(Store)->getMemoryVT() == MVT::i32 && + "Expected an i32 store"); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), + MachineMemOperand::MOLoad, 4, 4); + SDValue Ops[] = { Store, FIdx }; + Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? + PPCISD::LFIWZX : PPCISD::LFIWAX, + dl, DAG.getVTList(MVT::f64, MVT::Other), + Ops, 2, MVT::i32, MMO); + } else { + assert(PPCSubTarget.isPPC64() && + "i32->FP without LFIWAX supported only on PPC64"); + + int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Op.getOperand(0)); - // STD the extended value into the stack slot. 
- MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, 8, 8); - SDValue Ops[] = { DAG.getEntryNode(), Ext64, FIdx }; - SDValue Store = - DAG.getMemIntrinsicNode(PPCISD::STD_32, dl, DAG.getVTList(MVT::Other), - Ops, 4, MVT::i64, MMO); - // Load the value as a double. - SDValue Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, MachinePointerInfo(), - false, false, false, 0); + // STD the extended value into the stack slot. + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, 0); + + // Load the value as a double. + Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, false, 0); + } // FCFID it and return it. - SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Ld); - if (Op.getValueType() == MVT::f32) + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); + if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; } @@ -4264,12 +4936,13 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - std::vector NodeTys; SDValue MFFSreg, InFlag; // Save FP Control Word to register - NodeTys.push_back(MVT::f64); // return register - NodeTys.push_back(MVT::Glue); // unused in this context + EVT NodeTys[] = { + MVT::f64, // return register + MVT::Glue // unused in this context + }; SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0); // Save FP register to stack slot @@ -4503,11 +5176,21 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: - // tmp = VSPLTI[bhw], result = add tmp, tmp - if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) { - SDValue Res = BuildSplatI(SextVal >> 1, SplatSize, MVT::Other, DAG, dl); - Res = DAG.getNode(ISD::ADD, dl, Res.getValueType(), Res, Res); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) + // If this value is in the range [17,31] and is odd, use: + // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) + // If this value is in the range [-31,-17] and is odd, use: + // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) + // Note the last two are three-instruction sequences. + if (SextVal >= -32 && SextVal <= 31) { + // To avoid having these optimizations undone by constant folding, + // we convert to a pseudo that will be expanded later into one of + // the above forms. + SDValue Elt = DAG.getConstant(SextVal, MVT::i32); + EVT VT = Op.getValueType(); + int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4); + SDValue EltSize = DAG.getConstant(Size, MVT::i32); + return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is @@ -4603,23 +5286,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } } - // Three instruction sequences. - - // Odd, in range [17,31]: (vsplti C)-(vsplti -16). 
- if (SextVal >= 0 && SextVal <= 31) { - SDValue LHS = BuildSplatI(SextVal-16, SplatSize, MVT::Other, DAG, dl); - SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl); - LHS = DAG.getNode(ISD::SUB, dl, LHS.getValueType(), LHS, RHS); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), LHS); - } - // Odd, in range [-31,-17]: (vsplti C)+(vsplti -16). - if (SextVal >= -31 && SextVal <= 0) { - SDValue LHS = BuildSplatI(SextVal+16, SplatSize, MVT::Other, DAG, dl); - SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl); - LHS = DAG.getNode(ISD::ADD, dl, LHS.getValueType(), LHS, RHS); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), LHS); - } - return SDValue(); } @@ -4893,9 +5559,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(3), // RHS DAG.getConstant(CompareOpc, MVT::i32) }; - std::vector VTs; - VTs.push_back(Op.getOperand(2).getValueType()); - VTs.push_back(MVT::Glue); + EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3); // Now that we have the comparison, emit a copy from the CR to a GPR. @@ -5037,11 +5701,15 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget); + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, Op.getDebugLoc()); - case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); // Lower 64-bit shifts. @@ -5095,50 +5763,8 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, MVT::f64, N->getOperand(0), DAG.getIntPtrConstant(1)); - // This sequence changes FPSCR to do round-to-zero, adds the two halves - // of the long double, and puts FPSCR back the way it was. We do not - // actually model FPSCR. - std::vector NodeTys; - SDValue Ops[4], Result, MFFSreg, InFlag, FPreg; - - NodeTys.push_back(MVT::f64); // Return register - NodeTys.push_back(MVT::Glue); // Returns a flag for later insns - Result = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0); - MFFSreg = Result.getValue(0); - InFlag = Result.getValue(1); - - NodeTys.clear(); - NodeTys.push_back(MVT::Glue); // Returns a flag - Ops[0] = DAG.getConstant(31, MVT::i32); - Ops[1] = InFlag; - Result = DAG.getNode(PPCISD::MTFSB1, dl, NodeTys, Ops, 2); - InFlag = Result.getValue(0); - - NodeTys.clear(); - NodeTys.push_back(MVT::Glue); // Returns a flag - Ops[0] = DAG.getConstant(30, MVT::i32); - Ops[1] = InFlag; - Result = DAG.getNode(PPCISD::MTFSB0, dl, NodeTys, Ops, 2); - InFlag = Result.getValue(0); - - NodeTys.clear(); - NodeTys.push_back(MVT::f64); // result of add - NodeTys.push_back(MVT::Glue); // Returns a flag - Ops[0] = Lo; - Ops[1] = Hi; - Ops[2] = InFlag; - Result = DAG.getNode(PPCISD::FADDRTZ, dl, NodeTys, Ops, 3); - FPreg = Result.getValue(0); - InFlag = Result.getValue(1); - - NodeTys.clear(); - NodeTys.push_back(MVT::f64); - Ops[0] = DAG.getConstant(1, MVT::i32); - Ops[1] = MFFSreg; - Ops[2] = FPreg; - Ops[3] = InFlag; - Result = DAG.getNode(PPCISD::MTFSF, dl, NodeTys, Ops, 4); - FPreg = Result.getValue(0); + // Add the two halves of the long double in round-to-zero mode. 
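+    // (PPCISD::FADDRTZ is matched to the FADDrtz pseudo-instruction; its
+    // custom inserter, in EmitInstrWithCustomInserter below, saves the
+    // FPSCR, forces round-to-zero, performs the FADD, and restores the
+    // FPSCR afterwards.)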
+ SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); // We know the low half is about to be thrown away, so just use something // convenient. @@ -5230,7 +5856,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. bool is64bit = PPCSubTarget.isPPC64(); - unsigned ZeroReg = is64bit ? PPC::X0 : PPC::R0; + unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); @@ -5349,9 +5975,238 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, return BB; } +llvm::MachineBasicBlock* +PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(RC->hasType(MVT::i32) && "Invalid destination!"); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned restoreDstReg = MRI.createVirtualRegister(RC); + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + // For v = setjmp(buf), we generate + // + // thisMBB: + // SjLjSetup mainMBB + // bl mainMBB + // v_restore = 1 + // b sinkMBB + // + // mainMBB: + // buf[LabelOffset] = LR + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // Note that the structure of the jmp_buf used here is not compatible + // with that used by libc, and is not designed to be. Specifically, it + // stores only those 'reserved' registers that LLVM does not otherwise + // understand how to spill. Also, by convention, by the time this + // intrinsic is called, Clang has already stored the frame address in the + // first slot of the buffer and stack address in the third. Following the + // X86 target code, we'll store the jump address in the second slot. We also + // need to save the TOC pointer (R2) to handle jumps between shared + // libraries, and that will be stored in the fourth slot. The thread + // identifier (R13) is not affected. + + // thisMBB: + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t TOCOffset = 3 * PVT.getStoreSize(); + + // Prepare IP either in reg. 
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + unsigned LabelReg = MRI.createVirtualRegister(PtrRC); + unsigned BufReg = MI->getOperand(1).getReg(); + + if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) + .addReg(PPC::X2) + .addImm(TOCOffset / 4) + .addReg(BufReg); + + MIB.setMemRefs(MMOBegin, MMOEnd); + } + + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCL)).addMBB(mainMBB); + MIB.addRegMask(PPCRegInfo->getNoPreservedMask()); + + BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); + + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) + .addMBB(mainMBB); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); + + thisMBB->addSuccessor(mainMBB, /* weight */ 0); + thisMBB->addSuccessor(sinkMBB, /* weight */ 1); + + // mainMBB: + // mainDstReg = 0 + MIB = BuildMI(mainMBB, DL, + TII->get(PPCSubTarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); + + // Store IP + if (PPCSubTarget.isPPC64()) { + MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) + .addReg(LabelReg) + .addImm(LabelOffset / 4) + .addReg(BufReg); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) + .addReg(LabelReg) + .addImm(LabelOffset) + .addReg(BufReg); + } + + MIB.setMemRefs(MMOBegin, MMOEnd); + + BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(PPC::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(thisMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; + unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + const int64_t TOCOffset = 3 * PVT.getStoreSize(); + + unsigned BufReg = MI->getOperand(0).getReg(); + + // Reload FP (the jumped-to function may not have had a + // frame pointer, and if so, then its r31 will be restored + // as necessary). 
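+  // (Buffer layout reminder, in PVT-sized slots: slot 0 holds the frame
+  // address and slot 2 the stack pointer, both stored by Clang; slot 1
+  // holds the jump address, stored by emitEHSjLjSetJmp above; slot 3
+  // holds the TOC pointer on 64-bit SVR4.)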
+ if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) + .addImm(0) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) + .addImm(0) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload IP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) + .addImm(LabelOffset / 4) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) + .addImm(LabelOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload SP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) + .addImm(SPOffset / 4) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) + .addImm(SPOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // FIXME: When we also support base pointers, that register must also be + // restored here. + + // Reload TOC + if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) + .addImm(TOCOffset / 4) + .addReg(BufReg); + + MIB.setMemRefs(MMOBegin, MMOEnd); + } + + // Jump + BuildMI(*MBB, MI, DL, + TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); + BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); + + MI->eraseFromParent(); + return MBB; +} + MachineBasicBlock * PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { + if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 || + MI->getOpcode() == PPC::EH_SjLj_SetJmp64) { + return emitEHSjLjSetJmp(MI, BB); + } else if (MI->getOpcode() == PPC::EH_SjLj_LongJmp32 || + MI->getOpcode() == PPC::EH_SjLj_LongJmp64) { + return emitEHSjLjLongJmp(MI, BB); + } + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); // To "insert" these instructions we actually have to insert their @@ -5369,24 +6224,24 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, unsigned SelectPred = MI->getOperand(4).getImm(); DebugLoc dl = MI->getDebugLoc(); - // The SelectPred is ((BI << 5) | BO) for a BCC - unsigned BO = SelectPred & 0xF; - assert((BO == 12 || BO == 4) && "invalid predicate BO field for isel"); - - unsigned TrueOpNo, FalseOpNo; - if (BO == 12) { - TrueOpNo = 2; - FalseOpNo = 3; - } else { - TrueOpNo = 3; - FalseOpNo = 2; - SelectPred = PPC::InvertPredicate((PPC::Predicate)SelectPred); + unsigned SubIdx; + bool SwapOps; + switch (SelectPred) { + default: llvm_unreachable("invalid predicate for isel"); + case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; + case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; + case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; + case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; + case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; + case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; + case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; + case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; } BuildMI(*BB, MI, dl, TII->get(OpCode), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(TrueOpNo).getReg()) - .addReg(MI->getOperand(FalseOpNo).getReg()) - .addImm(SelectPred).addReg(MI->getOperand(1).getReg()); + .addReg(MI->getOperand(SwapOps? 3 : 2).getReg()) + .addReg(MI->getOperand(SwapOps? 
2 : 3).getReg()) + .addReg(MI->getOperand(1).getReg(), 0, SubIdx); } else if (MI->getOpcode() == PPC::SELECT_CC_I4 || MI->getOpcode() == PPC::SELECT_CC_I8 || MI->getOpcode() == PPC::SELECT_CC_F4 || @@ -5619,7 +6474,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = RegInfo.createVirtualRegister(RC); - unsigned ZeroReg = is64bit ? PPC::X0 : PPC::R0; + unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; // thisMBB: // ... // fallthrough --> loopMBB @@ -5722,6 +6577,75 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = exitMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) .addReg(ShiftReg); + } else if (MI->getOpcode() == PPC::FADDrtz) { + // This pseudo performs an FADD with rounding mode temporarily forced + // to round-to-zero. We emit this via custom inserter since the FPSCR + // is not modeled at the SelectionDAG level. + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + unsigned Src2 = MI->getOperand(2).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + + // Save FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); + + // Set rounding mode to round-to-zero. + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); + + // Perform addition. + BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); + + // Restore FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg); + } else if (MI->getOpcode() == PPC::FRINDrint || + MI->getOpcode() == PPC::FRINSrint) { + bool isf32 = MI->getOpcode() == PPC::FRINSrint; + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src = MI->getOperand(1).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + + // Perform the rounding. + BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FRINS : PPC::FRIND), Dest) + .addReg(Src); + + // Compare the results. + BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FCMPUS : PPC::FCMPUD), CRReg) + .addReg(Dest).addReg(Src); + + // If the results were not equal, then set the FPSCR XX bit. + MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, midMBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + BuildMI(*BB, MI, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_EQ).addReg(CRReg).addMBB(exitMBB); + + BB->addSuccessor(midMBB); + BB->addSuccessor(exitMBB); + + BB = midMBB; + + // Set the FPSCR XX bit (FE_INEXACT). Note that we cannot just set + // the FI bit here because that will not automatically set XX also, + // and XX is what libm interprets as the FE_INEXACT flag. 
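+      // (XX is bit 6 of the 32-bit FPSCR image; in the ISA's 64-bit FPSCR
+      // numbering it is bit 38, hence the "38 - 32" annotation below.)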
+ BuildMI(BB, dl, TII->get(PPC::MTFSB1)).addImm(/* 38 - 32 = */ 6); + BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); + + BB->addSuccessor(exitMBB); + + BB = exitMBB; } else { llvm_unreachable("Unexpected instr type to insert"); } @@ -5807,8 +6731,15 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val); DCI.AddToWorklist(Val.getNode()); - Val = DAG.getNode(PPCISD::STFIWX, dl, MVT::Other, N->getOperand(0), Val, - N->getOperand(2), N->getOperand(3)); + SDValue Ops[] = { + N->getOperand(0), Val, N->getOperand(2), + DAG.getValueType(N->getOperand(1).getValueType()) + }; + + Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, + DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops), + cast(N)->getMemoryVT(), + cast(N)->getMemOperand()); DCI.AddToWorklist(Val.getNode()); return Val; } @@ -5818,7 +6749,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, N->getOperand(1).getOpcode() == ISD::BSWAP && N->getOperand(1).getNode()->hasOneUse() && (N->getOperand(1).getValueType() == MVT::i32 || - N->getOperand(1).getValueType() == MVT::i16)) { + N->getOperand(1).getValueType() == MVT::i16 || + (TM.getSubtarget().hasLDBRX() && + TM.getSubtarget().isPPC64() && + N->getOperand(1).getValueType() == MVT::i64))) { SDValue BSwapOp = N->getOperand(1).getOperand(0); // Do an any-extend to 32-bits if this is a half-word input. if (BSwapOp.getValueType() == MVT::i16) @@ -5839,7 +6773,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // Turn BSWAP (LOAD) -> lhbrx/lwbrx. if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && N->getOperand(0).hasOneUse() && - (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16)) { + (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || + (TM.getSubtarget().hasLDBRX() && + TM.getSubtarget().isPPC64() && + N->getValueType(0) == MVT::i64))) { SDValue Load = N->getOperand(0); LoadSDNode *LD = cast(Load); // Create the byte-swapping load. @@ -5850,8 +6787,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, }; SDValue BSLoad = DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, - DAG.getVTList(MVT::i32, MVT::Other), Ops, 3, - LD->getMemoryVT(), LD->getMemOperand()); + DAG.getVTList(N->getValueType(0) == MVT::i64 ? + MVT::i64 : MVT::i32, MVT::Other), + Ops, 3, LD->getMemoryVT(), LD->getMemOperand()); // If this is an i16 load, insert the truncate. SDValue ResVal = BSLoad; @@ -5951,14 +6889,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); // Create the PPCISD altivec 'dot' comparison node. - std::vector VTs; SDValue Ops[] = { LHS.getOperand(2), // LHS of compare LHS.getOperand(3), // RHS of compare DAG.getConstant(CompareOpc, MVT::i32) }; - VTs.push_back(LHS.getOperand(2).getValueType()); - VTs.push_back(MVT::Glue); + EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3); // Unpack the result based on how the target uses it. @@ -6046,6 +6982,14 @@ PPCTargetLowering::getConstraintType(const std::string &Constraint) const { case 'v': case 'y': return C_RegisterClass; + case 'Z': + // FIXME: While Z does indicate a memory constraint, it specifically + // indicates an r+r address (used in conjunction with the 'y' modifier + // in the replacement string). Currently, we're forcing the base + // register to be r0 in the asm printer (which is interpreted as zero) + // and forming the complete address in the second register. This is + // suboptimal. 
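+    // A typical use pairs 'Z' with the 'y' operand modifier, e.g.
+    // (illustrative GCC-style inline asm, not from this patch):
+    //   asm ("lwbrx %0, %y1" : "=r"(val) : "Z"(*ptr));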
+ return C_Memory; } } return TargetLowering::getConstraintType(Constraint); @@ -6088,6 +7032,9 @@ PPCTargetLowering::getSingleConstraintMatchWeight( case 'y': weight = CW_Register; break; + case 'Z': + weight = CW_Memory; + break; } return weight; } @@ -6099,14 +7046,17 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'b': // R1-R31 + if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); + return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); case 'r': // R0-R31 if (VT == MVT::i64 && PPCSubTarget.isPPC64()) return std::make_pair(0U, &PPC::G8RCRegClass); return std::make_pair(0U, &PPC::GPRCRegClass); case 'f': - if (VT == MVT::f32) + if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &PPC::F4RCRegClass); - if (VT == MVT::f64) + if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); break; case 'v': @@ -6283,12 +7233,16 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setFrameAddressIsTaken(true); - bool is31 = (getTargetMachine().Options.DisableFramePointerElim(MF) || - MFI->hasVarSizedObjects()) && - MFI->getStackSize() && - !MF.getFunction()->getFnAttributes().hasNakedAttr(); - unsigned FrameReg = isPPC64 ? (is31 ? PPC::X31 : PPC::X1) : - (is31 ? PPC::R31 : PPC::R1); + + // Naked functions never have a frame pointer, and so we use r1. For all + // other functions, this decision must be delayed until during PEI. + unsigned FrameReg; + if (MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::Naked)) + FrameReg = isPPC64 ? PPC::X1 : PPC::R1; + else + FrameReg = isPPC64 ? PPC::FP8 : PPC::FP; + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); while (Depth--) @@ -6309,16 +7263,15 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, -/// probably because the source does not need to be loaded. If -/// 'IsZeroVal' is true, that means it's safe to return a -/// non-scalar-integer type, e.g. empty string source, constant, or loaded -/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is -/// constant so it does not need to be loaded. +/// probably because the source does not need to be loaded. If 'IsMemset' is +/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that +/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy +/// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { if (this->PPCSubTarget.isPPC64()) { @@ -6328,6 +7281,32 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, } } +bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, + bool *Fast) const { + if (DisablePPCUnaligned) + return false; + + // PowerPC supports unaligned memory access for simple non-vector types. 
+ // Although accessing unaligned addresses is not as efficient as accessing + // aligned addresses, it is generally more efficient than manual expansion, + // and generally only traps for software emulation when crossing page + // boundaries. + + if (!VT.isSimple()) + return false; + + if (VT.getSimpleVT().isVector()) + return false; + + if (VT == MVT::ppcf128) + return false; + + if (Fast) + *Fast = true; + + return true; +} + /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd