diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index a0b3e467d20..6eded03313d 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "PPCTargetObjectFile.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -38,6 +39,10 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+// FIXME: Remove this once soft-float is supported.
+static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic",
+cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden);
+
 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
 
@@ -50,21 +55,19 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
-static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
-  if (TM.getSubtargetImpl()->isDarwin())
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  // If it isn't a Mach-O file then it's going to be a linux ELF
+  // object file.
+  if (TT.isOSDarwin())
     return new TargetLoweringObjectFileMachO();
 
-  if (TM.getSubtargetImpl()->isSVR4ABI())
-    return new PPC64LinuxTargetObjectFile();
-
-  return new TargetLoweringObjectFileELF();
+  return new PPC64LinuxTargetObjectFile();
 }
 
 PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
-  : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
-  const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
-
-  setPow2DivIsCheap();
+    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+      Subtarget(*TM.getSubtargetImpl()) {
+  setPow2SDivIsCheap();
 
   // Use _setjmp/_longjmp instead of setjmp/longjmp.
   setUseUnderscoreSetJmp(true);
@@ -72,7 +75,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
 
   // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
   // arguments are at least 4/8 bytes aligned.
-  bool isPPC64 = Subtarget->isPPC64();
+  bool isPPC64 = Subtarget.isPPC64();
   setMinStackArgumentAlignment(isPPC64 ? 8:4);
 
   // Set up the register classes.
@@ -98,10 +101,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
   setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
 
-  if (Subtarget->useCRBits()) {
+  if (Subtarget.useCRBits()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
-    if (isPPC64 || Subtarget->hasFPCVT()) {
+    if (isPPC64 || Subtarget.hasFPCVT()) {
       setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
       AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ?
MVT::i64 : MVT::i32); @@ -176,17 +179,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget->hasFSQRT() && + if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && - Subtarget->hasFRSQRTE() && Subtarget->hasFRE())) + Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); - if (!Subtarget->hasFSQRT() && + if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && - Subtarget->hasFRSQRTES() && Subtarget->hasFRES())) + Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); - if (Subtarget->hasFCPSGN()) { + if (Subtarget.hasFCPSGN()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); } else { @@ -194,7 +197,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); } - if (Subtarget->hasFPRND()) { + if (Subtarget.hasFPRND()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); @@ -216,7 +219,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - if (Subtarget->hasPOPCNTD()) { + if (Subtarget.hasPOPCNTD()) { setOperationAction(ISD::CTPOP, MVT::i32 , Legal); setOperationAction(ISD::CTPOP, MVT::i64 , Legal); } else { @@ -228,7 +231,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::ROTR, MVT::i32 , Expand); setOperationAction(ISD::ROTR, MVT::i64 , Expand); - if (!Subtarget->useCRBits()) { + if (!Subtarget.useCRBits()) { // PowerPC does not have Select setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::i64, Expand); @@ -241,11 +244,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); // PowerPC wants to optimize integer setcc a bit - if (!Subtarget->useCRBits()) + if (!Subtarget.useCRBits()) setOperationAction(ISD::SETCC, MVT::i32, Custom); // PowerPC does not have BRCOND which requires SetCC - if (!Subtarget->useCRBits()) + if (!Subtarget.useCRBits()) setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); @@ -297,7 +300,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - if (Subtarget->isSVR4ABI()) { + if (Subtarget.isSVR4ABI()) { if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. setOperationAction(ISD::VAARG, MVT::i1, Promote); @@ -317,7 +320,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } else setOperationAction(ISD::VAARG, MVT::Other, Expand); - if (Subtarget->isSVR4ABI() && !isPPC64) + if (Subtarget.isSVR4ABI() && !isPPC64) // VACOPY is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VACOPY , MVT::Other, Custom); else @@ -350,7 +353,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - if (Subtarget->has64BitSupport()) { + if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. 
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); @@ -360,7 +363,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We cannot do this with Promote because i64 is not a legal type. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - if (PPCSubTarget.hasLFIWAX() || Subtarget->isPPC64()) + if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. @@ -368,8 +371,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } // With the instructions enabled under FPCVT, we can do everything. - if (PPCSubTarget.hasFPCVT()) { - if (Subtarget->has64BitSupport()) { + if (Subtarget.hasFPCVT()) { + if (Subtarget.has64BitSupport()) { setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); @@ -382,7 +385,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); } - if (Subtarget->use64BitRegs()) { + if (Subtarget.use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or @@ -398,7 +401,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } - if (Subtarget->hasAltivec()) { + if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -454,12 +457,15 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); @@ -487,7 +493,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::XOR , MVT::v4i32, Legal); setOperationAction(ISD::LOAD , MVT::v4i32, Legal); setOperationAction(ISD::SELECT, MVT::v4i32, - Subtarget->useCRBits() ? Legal : Expand); + Subtarget.useCRBits() ? 
Legal : Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); @@ -506,7 +512,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); - if (TM.Options.UnsafeFPMath || Subtarget->hasVSX()) { + if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } @@ -526,15 +532,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Altivec does not contain unordered floating-point compare instructions setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); - if (Subtarget->hasVSX()) { + if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); @@ -561,11 +562,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETUGT, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETUGE, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETULT, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETULE, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); @@ -612,7 +608,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } } - if (Subtarget->has64BitSupport()) { + if (Subtarget.has64BitSupport()) { setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); } @@ -626,6 +622,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Altivec instructions set fields to all zeros or all ones. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + if (!isPPC64) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + if (isPPC64) { setStackPointerRegisterToSaveRestore(PPC::X1); setExceptionPointerRegister(PPC::X3); @@ -641,7 +644,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); - if (Subtarget->useCRBits()) + if (Subtarget.useCRBits()) setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::BSWAP); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); @@ -650,7 +653,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - if (Subtarget->useCRBits()) { + if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::SELECT_CC); @@ -663,7 +666,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } // Darwin long double math library functions have $LDBL128 appended. 
- if (Subtarget->isDarwin()) { + if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); @@ -678,21 +681,16 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. - if (Subtarget->useCRBits()) + if (Subtarget.useCRBits()) setHasMultipleConditionRegisters(); setMinFunctionAlignment(2); - if (PPCSubTarget.isDarwin()) + if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); - if (isPPC64 && Subtarget->isJITCodeModel()) - // Temporary workaround for the inability of PPC64 JIT to handle jump - // tables. - setSupportJumpTables(false); - setInsertFencesForAtomic(true); - if (Subtarget->enableMachineScheduler()) + if (Subtarget.enableMachineScheduler()) setSchedulingPreference(Sched::Source); else setSchedulingPreference(Sched::Hybrid); @@ -701,8 +699,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // The Freescale cores does better with aggressive inlining of memcpy and // friends. Gcc uses same threshold of 128 bytes (= 32 word stores). - if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc || - Subtarget->getDarwinDirective() == PPC::DIR_E5500) { + if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || + Subtarget.getDarwinDirective() == PPC::DIR_E5500) { MaxStoresPerMemset = 32; MaxStoresPerMemsetOptSize = 16; MaxStoresPerMemcpy = 32; @@ -746,14 +744,14 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, /// function arguments in the caller parameter area. unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { // Darwin passes everything on 4 byte boundary. - if (PPCSubTarget.isDarwin()) + if (Subtarget.isDarwin()) return 4; // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. - unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4; - if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX()) - getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 32 : 16); + unsigned Align = Subtarget.isPPC64() ? 8 : 4; + if (Subtarget.hasAltivec() || Subtarget.hasQPX()) + getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16); return Align; } @@ -773,7 +771,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; - case PPCISD::TOC_RESTORE: return "PPCISD::TOC_RESTORE"; case PPCISD::LOAD: return "PPCISD::LOAD"; case PPCISD::LOAD_TOC: return "PPCISD::LOAD_TOC"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; @@ -825,7 +822,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) - return PPCSubTarget.useCRBits() ? MVT::i1 : MVT::i32; + return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; return VT.changeVectorElementTypeToInteger(); } @@ -854,15 +851,30 @@ static bool isConstantOrUndef(int Op, int Val) { /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. -bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { - if (!isUnary) { +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operantion with two different inputs (2). 
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { + bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; for (unsigned i = 0; i != 16; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) return false; - } else { + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; ++i) + if (!isConstantOrUndef(N->getMaskElt(i), i*2)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 1; for (unsigned i = 0; i != 8; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+1) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+1)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) return false; } return true; @@ -870,18 +882,34 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. -bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { - if (!isUnary) { +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operantion with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { + bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; for (unsigned i = 0; i != 16; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) return false; - } else { + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; i += 2) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 2; for (unsigned i = 0; i != 8; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+3) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+9), i*2+3)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) return false; } return true; @@ -908,27 +936,64 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, } /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for -/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). +/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). 
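As a rough standalone illustration of the three ShuffleKind cases used by the VPKUHUM/VPKUWUM checks above (not part of the patch; the helper expectedVPKUHUMMask and the main() driver are invented for this sketch), the following C++ generates the byte indices that the new isVPKUHUMShuffleMask accepts for each kind:

// Mirrors the index checks in isVPKUHUMShuffleMask: kind 0 is the big-endian
// two-input pack, kind 2 is the little-endian two-input pack (inputs already
// swapped by the caller), and kind 1 uses a single input twice.
#include <array>
#include <cstdio>

static std::array<int, 16> expectedVPKUHUMMask(unsigned ShuffleKind, bool IsLE) {
  std::array<int, 16> Mask{};
  if (ShuffleKind == 0) {                 // big-endian, two distinct inputs
    for (unsigned i = 0; i != 16; ++i)
      Mask[i] = i * 2 + 1;                // odd bytes of the 32-byte input pair
  } else if (ShuffleKind == 2) {          // little-endian, swapped inputs
    for (unsigned i = 0; i != 16; ++i)
      Mask[i] = i * 2;                    // even bytes
  } else {                                // ShuffleKind == 1: unary
    unsigned j = IsLE ? 0 : 1;
    for (unsigned i = 0; i != 8; ++i)
      Mask[i] = Mask[i + 8] = i * 2 + j;  // same pattern in both halves
  }
  return Mask;
}

int main() {
  for (int Elt : expectedVPKUHUMMask(0, /*IsLE=*/false))
    std::printf("%d ", Elt);              // prints 1 3 5 ... 31
  std::printf("\n");
}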
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary) { - if (!isUnary) - return isVMerge(N, UnitSize, 8, 24); - return isVMerge(N, UnitSize, 8, 8); + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 2) // swapped + return isVMerge(N, UnitSize, 0, 16); + else + return false; + } else { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 0) // normal + return isVMerge(N, UnitSize, 8, 24); + else + return false; + } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for -/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). +/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary) { - if (!isUnary) - return isVMerge(N, UnitSize, 0, 16); - return isVMerge(N, UnitSize, 0, 0); + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 2) // swapped + return isVMerge(N, UnitSize, 8, 24); + else + return false; + } else { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 0) // normal + return isVMerge(N, UnitSize, 0, 16); + else + return false; + } } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { +/// The ShuffleKind distinguishes between big-endian operations with two +/// different inputs (0), either-endian operations with two identical inputs +/// (1), and little-endian operations with two different inputs (2). For the +/// latter, the input operands are swapped (see PPCInstrAltivec.td). +int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; @@ -945,19 +1010,27 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; + ShiftAmt -= i; + bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()-> + isLittleEndian(); - if (!isUnary) { + if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) return -1; - } else { + } else if (ShuffleKind == 1) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) return -1; - } + } else + return -1; + + if (ShuffleKind == 2 && isLE) + ShiftAmt = 16 - ShiftAmt; + return ShiftAmt; } @@ -1009,10 +1082,14 @@ bool PPC::isAllNegativeZeroVector(SDNode *N) { /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 
-unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) { +unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); - return SVOp->getMaskElt(0) / EltSize; + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) + return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); + else + return SVOp->getMaskElt(0) / EltSize; } /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed @@ -1136,7 +1213,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. static bool isIntS16Immediate(SDNode *N, short &Imm) { - if (N->getOpcode() != ISD::Constant) + if (!isa(N)) return false; Imm = (short)cast(N)->getZExtValue(); @@ -1285,7 +1362,13 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. - Base = N.getOperand(0); + if (FrameIndexSDNode *FI = + dyn_cast(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } Disp = DAG.getTargetConstant(imm, N.getValueType()); return true; } @@ -1298,7 +1381,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; } @@ -1349,7 +1432,7 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, } // Otherwise, do it the hard way, using R0 as the base register. - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, N.getValueType()); Index = N; return true; @@ -1445,10 +1528,9 @@ static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, HiOpFlags = PPCII::MO_HA; LoOpFlags = PPCII::MO_LO; - // Don't use the pic base if not in PIC relocation model. Or if we are on a - // non-darwin platform. We don't support PIC on other platforms yet. - bool isPIC = TM.getRelocationModel() == Reloc::PIC_ && - TM.getSubtarget().isDarwin(); + // Don't use the pic base if not in PIC relocation model. + bool isPIC = TM.getRelocationModel() == Reloc::PIC_; + if (isPIC) { HiOpFlags |= PPCII::MO_PIC_FLAG; LoOpFlags |= PPCII::MO_PIC_FLAG; @@ -1496,7 +1578,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
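The little-endian adjustment in getVSPLTImmediate above mirrors the splat element index because, viewed from the instruction, elements are numbered from the opposite end of the register. A minimal standalone sketch of that arithmetic (vspltImmediate is an invented name; SplatElt stands for getMaskElt(0) / EltSize in the patch):

#include <cassert>

static unsigned vspltImmediate(unsigned SplatElt, unsigned EltSize, bool IsLE) {
  unsigned NumElts = 16 / EltSize;       // 16, 8 or 4 elements per Altivec register
  return IsLE ? NumElts - 1 - SplatElt   // count from the other end
              : SplatElt;
}

int main() {
  assert(vspltImmediate(1, 4, /*IsLE=*/false) == 1); // BE: element 1 -> vspltw 1
  assert(vspltImmediate(1, 4, /*IsLE=*/true) == 2);  // LE: element 1 -> vspltw 2
  return 0;
}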
- if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1504,6 +1586,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), + PPCII::MO_PIC_FLAG); + SDLoc DL(CP); + return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + SDValue CPIHi = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); SDValue CPILo = @@ -1517,7 +1608,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1525,6 +1616,15 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + PPCII::MO_PIC_FLAG); + SDLoc DL(GA); + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); return LowerLabelRef(JTIHi, JTILo, isPIC, DAG); @@ -1554,7 +1654,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(); - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); @@ -1587,47 +1687,61 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, if (Model == TLSModel::GeneralDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, - GOTReg, TGA); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT, - GOTEntryHi, TGA); + GOTPtr, TGA); // We need a chain node, and don't have one handy. The underlying // call has no side effects, so using the function entry node // suffices. SDValue Chain = DAG.getEntryNode(); - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); - SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? 
MVT::i64 : MVT::i32); SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl, PtrVT, ParmReg, TGA); // The return value from GET_TLS_ADDR really is in X3 already, but // some hacks are needed here to tie everything together. The extra // copies dissolve during subsequent transforms. - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); - return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT); + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr); + return DAG.getCopyFromReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, PtrVT); } if (Model == TLSModel::LocalDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, - GOTReg, TGA); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT, - GOTEntryHi, TGA); + GOTPtr, TGA); // We need a chain node, and don't have one handy. The underlying // call has no side effects, so using the function entry node // suffices. SDValue Chain = DAG.getEntryNode(); - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); - SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? MVT::i64 : MVT::i32); SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl, PtrVT, ParmReg, TGA); // The return value from GET_TLSLD_ADDR really is in X3 already, but // some hacks are needed here to tie everything together. The extra // copies dissolve during subsequent transforms. - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr); SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, Chain, ParmReg, TGA); return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); @@ -1645,7 +1759,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
- if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1654,6 +1768,14 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV); + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, + GSDN->getOffset(), + PPCII::MO_PIC_FLAG); + return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32)); + } + SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); SDValue GALo = @@ -1748,7 +1870,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, // gpr_index SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, VAListPtr, MachinePointerInfo(SV), MVT::i8, - false, false, 0); + false, false, false, 0); InChain = GprIndex.getValue(1); if (VT == MVT::i64) { @@ -1771,7 +1893,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, // fpr SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, FprPtr, MachinePointerInfo(SV), MVT::i8, - false, false, 0); + false, false, false, 0); InChain = FprIndex.getValue(1); SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, @@ -1890,7 +2012,8 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__trampoline_setup", PtrVT), &Args, 0); + DAG.getExternalSymbol("__trampoline_setup", PtrVT), + std::move(Args), 0); std::pair CallResult = LowerCallTo(CLI); return CallResult.second; @@ -2080,11 +2203,117 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, unsigned ArgSize = ArgVT.getStoreSize(); if (Flags.isByVal()) ArgSize = Flags.getByValSize(); - ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + + // Round up to multiples of the pointer size, except for array members, + // which are always packed. + if (!Flags.isInConsecutiveRegs()) + ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; return ArgSize; } +/// CalculateStackSlotAlignment - Calculates the alignment of this argument +/// on the stack. +static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize) { + unsigned Align = PtrByteSize; + + // Altivec parameters are padded to a 16 byte boundary. + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + Align = 16; + + // ByVal parameters are aligned as requested. + if (Flags.isByVal()) { + unsigned BVAlign = Flags.getByValAlign(); + if (BVAlign > PtrByteSize) { + if (BVAlign % PtrByteSize != 0) + llvm_unreachable( + "ByVal alignment is not a multiple of the pointer size"); + + Align = BVAlign; + } + } + + // Array members are always packed to their original alignment. + if (Flags.isInConsecutiveRegs()) { + // If the array member was split into multiple registers, the first + // needs to be aligned to the size of the full type. (Except for + // ppcf128, which is only aligned as its f64 components.) 
+ if (Flags.isSplit() && OrigVT != MVT::ppcf128) + Align = OrigVT.getStoreSize(); + else + Align = ArgVT.getStoreSize(); + } + + return Align; +} + +/// CalculateStackSlotUsed - Return whether this argument will use its +/// stack slot (instead of being passed in registers). ArgOffset, +/// AvailableFPRs, and AvailableVRs must hold the current argument +/// position, and will be updated to account for this argument. +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, + unsigned LinkageSize, + unsigned ParamAreaSize, + unsigned &ArgOffset, + unsigned &AvailableFPRs, + unsigned &AvailableVRs) { + bool UseMemory = false; + + // Respect alignment of argument on the stack. + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + // If there's no space left in the argument save area, we must + // use memory (this check also catches zero-sized arguments). + if (ArgOffset >= LinkageSize + ParamAreaSize) + UseMemory = true; + + // Allocate argument on the stack. + ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // If we overran the argument save area, we must use memory + // (this check catches arguments passed partially in memory) + if (ArgOffset > LinkageSize + ParamAreaSize) + UseMemory = true; + + // However, if the argument is actually passed in an FPR or a VR, + // we don't use memory after all. + if (!Flags.isByVal()) { + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) + if (AvailableFPRs > 0) { + --AvailableFPRs; + return false; + } + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + if (AvailableVRs > 0) { + --AvailableVRs; + return false; + } + } + + return UseMemory; +} + +/// EnsureStackAlignment - Round stack frame size up from NumBytes to +/// ensure minimum alignment required for target. +static unsigned EnsureStackAlignment(const TargetMachine &Target, + unsigned NumBytes) { + unsigned TargetAlign = + Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment(); + unsigned AlignMask = TargetAlign - 1; + NumBytes = (NumBytes + AlignMask) & ~AlignMask; + return NumBytes; +} + SDValue PPCTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2093,8 +2322,8 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { - if (PPCSubTarget.isSVR4ABI()) { - if (PPCSubTarget.isPPC64()) + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); else @@ -2156,11 +2385,12 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Assign locations to all of the incoming arguments. SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Reserve space for the linkage area on the stack. 
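The new helpers above lean on two round-up idioms: CalculateStackSlotUsed rounds the running ArgOffset up to the slot alignment, and EnsureStackAlignment pads the frame size to the target stack alignment. A small standalone sketch of both (function names invented; the values are just examples):

#include <cassert>

// Round an argument offset up to its slot alignment, as in CalculateStackSlotUsed.
static unsigned roundUpToAlign(unsigned Offset, unsigned Align) {
  return ((Offset + Align - 1) / Align) * Align;
}

// Pad a frame size to a power-of-two stack alignment, as in EnsureStackAlignment.
static unsigned roundUpPow2(unsigned NumBytes, unsigned TargetAlign) {
  unsigned AlignMask = TargetAlign - 1;
  return (NumBytes + AlignMask) & ~AlignMask;
}

int main() {
  assert(roundUpToAlign(36, 16) == 48);  // a 16-byte Altivec arg at offset 36 lands at 48
  assert(roundUpPow2(120, 16) == 128);   // frame sizes are padded to the stack alignment
  return 0;
}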
- CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false); + CCInfo.AllocateStack(LinkageSize, PtrByteSize); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -2183,7 +2413,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( RC = &PPC::F4RCRegClass; break; case MVT::f64: - if (PPCSubTarget.hasVSX()) + if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else RC = &PPC::F8RCRegClass; @@ -2230,7 +2460,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // caller's stack frame, right above the parameter list area. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -2239,23 +2469,14 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Area that is at least reserved in the caller of this function. unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); + MinReservedArea = std::max(MinReservedArea, LinkageSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized function's reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - PPCFunctionInfo *FI = MF.getInfo(); - - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(false, false)); - - unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - - FI->setMinReservedArea(MinReservedArea); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); SmallVector MemOps; @@ -2272,7 +2493,9 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; - const unsigned NumFPArgRegs = array_lengthof(FPArgRegs); + unsigned NumFPArgRegs = array_lengthof(FPArgRegs); + if (DisablePPCFloatInVariadic) + NumFPArgRegs = 0; FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs, NumGPArgRegs)); @@ -2281,7 +2504,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Make room for NumGPArgRegs and NumFPArgRegs. int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + - NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8; + NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8; FuncInfo->setVarArgsStackOffset( MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, @@ -2323,7 +2546,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( MachinePointerInfo(), false, false, 0); MemOps.push_back(Store); // Increment the address by eight for the next argument to store - SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8, + SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } @@ -2351,32 +2574,6 @@ PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); } -// Set the size that is at least reserved in caller of this function. Tail -// call optimized functions' reserved stack space needs to be aligned so that -// taking the difference between two stack areas will result in an aligned -// stack. 
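With the new -disable-ppc-float-in-variadic option, the 32-bit SVR4 varargs register save area shrinks because no FPRs are spilled. A standalone sketch of the Depth computation above (function name invented; it assumes the usual eight argument GPRs and the eight argument FPRs F1-F8 shown in this hunk):

#include <cassert>

static unsigned varargsSaveAreaSize(unsigned NumGPArgRegs, unsigned NumFPArgRegs,
                                    bool DisableFloatInVariadic) {
  if (DisableFloatInVariadic)  // -disable-ppc-float-in-variadic
    NumFPArgRegs = 0;
  return NumGPArgRegs * 4      // each GPR slot is 4 bytes on PPC32
       + NumFPArgRegs * 8;     // each FPR slot holds an f64
}

int main() {
  assert(varargsSaveAreaSize(8, 8, false) == 96); // default: 32 + 64 bytes
  assert(varargsSaveAreaSize(8, 8, true) == 32);  // float saving disabled
  return 0;
}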
-void -PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG, - unsigned nAltivecParamsAtEnd, - unsigned MinReservedArea, - bool isPPC64) const { - PPCFunctionInfo *FI = MF.getInfo(); - // Add the Altivec parameters at the end, if needed. - if (nAltivecParamsAtEnd) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += 16*nAltivecParamsAtEnd; - } - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); - unsigned TargetAlign - = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - FI->setMinReservedArea(MinReservedArea); -} - SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Chain, @@ -2387,6 +2584,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // + bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); @@ -2397,9 +2596,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true); - // Area that is at least reserved in caller of this function. - unsigned MinReservedArea = ArgOffset; + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -2421,44 +2619,52 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof(VR); - unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + // Do a first pass over the arguments to determine whether the ABI + // guarantees that our caller has allocated the parameter save area + // on its stack frame. In the ELFv1 ABI, this is always the case; + // in the ELFv2 ABI, it is true if this is a vararg function or if + // any parameter is located in a stack slot. + + bool HasParameterArea = !isELFv2ABI || isVarArg; + unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; + unsigned NumBytes = LinkageSize; + unsigned AvailableFPRs = Num_FPR_Regs; + unsigned AvailableVRs = Num_VR_Regs; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs)) + HasParameterArea = true; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; SmallVector MemOps; - unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; + EVT OrigVT = Ins[ArgNo].ArgVT; unsigned ObjSize = ObjectVT.getStoreSize(); unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); CurArgIdx = Ins[ArgNo].OrigArgIndex; + /* Respect alignment of argument on the stack. 
*/ + unsigned Align = + CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; unsigned CurArgOffset = ArgOffset; - // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. - if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || - ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8 || - ObjectVT==MVT::v2f64 || ObjectVT==MVT::v2i64) { - if (isVarArg) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += CalculateStackSlotSize(ObjectVT, - Flags, - PtrByteSize); - } else - nAltivecParamsAtEnd++; - } else - // Calculate min reserved area. - MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, - Flags, - PtrByteSize); + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, Num_GPR_Regs); // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. @@ -2480,21 +2686,31 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } - unsigned BVAlign = Flags.getByValAlign(); - if (BVAlign > 8) { - ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; - CurArgOffset = ArgOffset; - } - - // All aggregates smaller than 8 bytes must be passed right-justified. - if (ObjSize < PtrByteSize) - CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize); - // The value of the object is its address. - int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); + // Create a stack object covering all stack doublewords occupied + // by the argument. If the argument is (fully or partially) on + // the stack, or if the argument is fully in registers but the + // caller has allocated the parameter save anyway, we can refer + // directly to the caller's stack frame. Otherwise, create a + // local copy in our own frame. + int FI; + if (HasParameterArea || + ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) + FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false, true); + else + FI = MFI->CreateStackObject(ArgSize, Align, false); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(FIN); - if (ObjSize < 8) { + // Handle aggregates smaller than 8 bytes. + if (ObjSize < PtrByteSize) { + // The value of the object is its address, which differs from the + // address of the enclosing doubleword on big-endian systems. + SDValue Arg = FIN; + if (!isLittleEndian) { + SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT); + Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); + } + InVals.push_back(Arg); + if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); @@ -2503,25 +2719,19 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( if (ObjSize==1 || ObjSize==2 || ObjSize==4) { EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); - Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, MachinePointerInfo(FuncArg), ObjType, false, false, 0); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area - // slot. The address of the parameter was already calculated - // above (InVals.push_back(FIN)) to be the right-justified - // offset within the slot. For this store, we need a new - // frame index that points at the beginning of the slot. 
- int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + // slot. Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(FuncArg), false, false, 0); } MemOps.push_back(Store); - ++GPR_idx; } // Whether we copied from a register or not, advance the offset // into the parameter save area by a full doubleword. @@ -2529,27 +2739,29 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } + // The value of the object is its address, which is the address of + // its first stack doubleword. + InVals.push_back(FIN); + + // Store whatever pieces of the object are in registers to memory. for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { - // Store whatever pieces of the object are in registers - // to memory. ArgOffset will be the address of the beginning - // of the object. - if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, j), - false, false, 0); - MemOps.push_back(Store); - ++GPR_idx; - ArgOffset += PtrByteSize; - } else { - ArgOffset += ArgSize - j; + if (GPR_idx == Num_GPR_Regs) break; + + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Addr = FIN; + if (j) { + SDValue Off = DAG.getConstant(j, PtrVT); + Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(FuncArg, j), + false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; } + ArgOffset += ArgSize; continue; } @@ -2558,6 +2770,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); @@ -2566,8 +2781,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); - - ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; @@ -2577,29 +2790,47 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::f32: case MVT::f64: - // Every 8 bytes of argument space consumes one of the GPRs available for - // argument passing. - if (GPR_idx != Num_GPR_Regs) { - ++GPR_idx; - } + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); else - VReg = MF.addLiveIn(FPR[FPR_idx], PPCSubTarget.hasVSX() ? + VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? 
&PPC::VSFRCRegClass : &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; + } else if (GPR_idx != Num_GPR_Regs) { + // This can only ever happen in the presence of f32 array types, + // since otherwise we never run out of FPRs before running out + // of GPRs. + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::f32) { + if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) + ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); + } + + ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { needsLoad = true; - ArgSize = PtrByteSize; } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; + ArgOffset += ArgSize; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; case MVT::v4f32: case MVT::v4i32: @@ -2607,39 +2838,28 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: - // Note that vector arguments in registers don't reserve stack space, - // except in varargs functions. + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. if (VR_idx != Num_VR_Regs) { unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - if (isVarArg) { - while ((ArgOffset % 16) != 0) { - ArgOffset += PtrByteSize; - if (GPR_idx != Num_GPR_Regs) - GPR_idx++; - } - ArgOffset += 16; - GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? - } ++VR_idx; } else { - // Vectors are aligned. - ArgOffset = ((ArgOffset+15)/16)*16; - CurArgOffset = ArgOffset; - ArgOffset += 16; needsLoad = true; } + ArgOffset += 16; break; } // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type. if (needsLoad) { - int FI = MFI->CreateFixedObject(ObjSize, - CurArgOffset + (ArgSize - ObjSize), - isImmutable); + if (ObjSize < ArgSize && !isLittleEndian) + CurArgOffset += ArgSize - ObjSize; + int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), false, false, false, 0); @@ -2648,11 +2868,19 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( InVals.push_back(ArgVal); } + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea; + if (HasParameterArea) + MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + else + MinReservedArea = LinkageSize; + // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. 
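When an f32 array member arrives in a GPR (the homogeneous-float case handled above), the callee has to pick the correct 32-bit half of the doubleword before bitcasting to float. A standalone sketch of that selection, mirroring the SRL/TRUNCATE/BITCAST sequence in the hunk (f32FromGPR is an invented name):

#include <cassert>
#include <cstdint>
#include <cstring>

static float f32FromGPR(uint64_t GPRVal, unsigned ArgOffset, bool IsLE) {
  // The element in the most-significant half of the register is the one at
  // doubleword offset 0 on big-endian, or at offset 4 on little-endian.
  uint32_t Bits = (ArgOffset % 8) == (IsLE ? 4u : 0u)
                      ? uint32_t(GPRVal >> 32)  // SRL by 32, then truncate
                      : uint32_t(GPRVal);       // plain truncate
  float F;
  std::memcpy(&F, &Bits, sizeof F);             // the BITCAST step
  return F;
}

int main() {
  // GPR image of the float array { 1.0f, 2.0f } as seen by a big-endian callee.
  uint64_t GPR = (uint64_t(0x3f800000) << 32) | 0x40000000;
  assert(f32FromGPR(GPR, /*ArgOffset=*/0, /*IsLE=*/false) == 1.0f);
  assert(f32FromGPR(GPR, /*ArgOffset=*/4, /*IsLE=*/false) == 2.0f);
  return 0;
}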
- setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. @@ -2666,7 +2894,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by deferencing the // result of va_next. - for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { + for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx < Num_GPR_Regs; ++GPR_idx) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, @@ -2705,7 +2934,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 8 : 4; - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); + unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; @@ -2830,7 +3061,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( CurArgOffset = CurArgOffset + (4 - ObjSize); } // The value of the object is its address. - int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); + int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); if (ObjSize==1 || ObjSize==2) { @@ -2996,11 +3227,21 @@ PPCTargetLowering::LowerFormalArguments_Darwin( InVals.push_back(ArgVal); } + // Allow for Altivec parameters at the end, if needed. + if (nAltivecParamsAtEnd) { + MinReservedArea = ((MinReservedArea+15)/16)*16; + MinReservedArea += 16*nAltivecParamsAtEnd; + } + + // Area that is at least reserved in the caller of this function. + MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); + // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. @@ -3039,75 +3280,6 @@ PPCTargetLowering::LowerFormalArguments_Darwin( return Chain; } -/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus -/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI. -static unsigned -CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, - bool isPPC64, - bool isVarArg, - unsigned CC, - const SmallVectorImpl - &Outs, - const SmallVectorImpl &OutVals, - unsigned &nAltivecParamsAtEnd) { - // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with 24/48 bytes, which is - // prereserved space for [SP][CR][LR][3 x unused]. 
- unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true); - unsigned NumOps = Outs.size(); - unsigned PtrByteSize = isPPC64 ? 8 : 4; - - // Add up all the space actually used. - // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually - // they all go in registers, but we must reserve stack space for them for - // possible use by the caller. In varargs or 64-bit calls, parameters are - // assigned stack space in order, with padding so Altivec parameters are - // 16-byte aligned. - nAltivecParamsAtEnd = 0; - for (unsigned i = 0; i != NumOps; ++i) { - ISD::ArgFlagsTy Flags = Outs[i].Flags; - EVT ArgVT = Outs[i].VT; - // Varargs Altivec parameters are padded to a 16 byte boundary. - if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 || - ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8 || - ArgVT==MVT::v2f64 || ArgVT==MVT::v2i64) { - if (!isVarArg && !isPPC64) { - // Non-varargs Altivec parameters go after all the non-Altivec - // parameters; handle those later so we know how much padding we need. - nAltivecParamsAtEnd++; - continue; - } - // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. - NumBytes = ((NumBytes+15)/16)*16; - } - NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); - } - - // Allow for Altivec parameters at the end, if needed. - if (nAltivecParamsAtEnd) { - NumBytes = ((NumBytes+15)/16)*16; - NumBytes += 16*nAltivecParamsAtEnd; - } - - // The prolog code of the callee may store up to 8 GPR argument registers to - // the stack, allowing va_start to index over them in memory if its varargs. - // Because we cannot tell if this is needed on the caller side, we have to - // conservatively assume that it is needed. As such, make sure we have at - // least enough stack space for the caller to store the 8 GPRs. - NumBytes = std::max(NumBytes, - PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); - - // Tail call needs the stack to be aligned. - if (CC == CallingConv::Fast && DAG.getTarget().Options.GuaranteedTailCallOpt){ - unsigned TargetAlign = DAG.getMachineFunction().getTarget(). - getFrameLowering()->getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - NumBytes = (NumBytes + AlignMask) & ~AlignMask; - } - - return NumBytes; -} - /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be /// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, @@ -3279,7 +3451,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, SDLoc dl) const { if (SPDiff) { // Load the LR and FP stack slot for later adjusting. - EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32; + EVT VT = Subtarget.isPPC64() ? 
MVT::i64 : MVT::i32; LROpOut = getReturnAddrFrameIndex(DAG); LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(), false, false, false, 0); @@ -3372,10 +3544,11 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall, SmallVectorImpl > &RegsToPass, SmallVectorImpl &Ops, std::vector &NodeTys, - const PPCSubtarget &PPCSubTarget) { + const PPCSubtarget &Subtarget) { - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isSVR4ABI = PPCSubTarget.isSVR4ABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); NodeTys.push_back(MVT::Other); // Returns a chain @@ -3384,49 +3557,49 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned CallOpc = PPCISD::CALL; bool needIndirectCall = true; - if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { - // If this is an absolute destination address, use the munged value. - Callee = SDValue(Dest, 0); - needIndirectCall = false; - } + if (!isSVR4ABI || !isPPC64) + if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { + // If this is an absolute destination address, use the munged value. + Callee = SDValue(Dest, 0); + needIndirectCall = false; + } if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201 - // Use indirect calls for ALL functions calls in JIT mode, since the - // far-call stubs may be outside relocation limits for a BL instruction. - if (!DAG.getTarget().getSubtarget().isJITCodeModel()) { - unsigned OpFlags = 0; - if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (PPCSubTarget.getTargetTriple().isMacOSX() && - PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && - (G->getGlobal()->isDeclaration() || - G->getGlobal()->isWeakForLinker())) { - // PC-relative references to external symbols should go through $stub, - // unless we're building with the leopard linker or later, which - // automatically synthesizes these stubs. - OpFlags = PPCII::MO_DARWIN_STUB; - } - - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, - // every direct call is) turn it into a TargetGlobalAddress / - // TargetExternalSymbol node so that legalize doesn't hack it. - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, - Callee.getValueType(), - 0, OpFlags); - needIndirectCall = false; + unsigned OpFlags = 0; + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && + (G->getGlobal()->isDeclaration() || + G->getGlobal()->isWeakForLinker())) || + (Subtarget.isTargetELF() && !isPPC64 && + !G->getGlobal()->hasLocalLinkage() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = PPCII::MO_PLT_OR_STUB; } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, + // every direct call is) turn it into a TargetGlobalAddress / + // TargetExternalSymbol node so that legalize doesn't hack it. 
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, + Callee.getValueType(), 0, OpFlags); + needIndirectCall = false; } if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; - if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (PPCSubTarget.getTargetTriple().isMacOSX() && - PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) || + (Subtarget.isTargetELF() && !isPPC64 && + DAG.getTarget().getRelocationModel() == Reloc::PIC_) ) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. - OpFlags = PPCII::MO_DARWIN_STUB; + OpFlags = PPCII::MO_PLT_OR_STUB; } Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), @@ -3439,7 +3612,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // to do the call, we can't use PPCISD::CALL. SDValue MTCTROps[] = {Chain, Callee, InFlag}; - if (isSVR4ABI && isPPC64) { + if (isSVR4ABI && isPPC64 && !isELFv2ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). @@ -3496,8 +3669,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // additional register being allocated and an unnecessary move instruction // being generated. VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TOCOff = DAG.getIntPtrConstant(8); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, - Callee, InFlag); + AddTOC, InFlag); Chain = LoadTOCPtr.getValue(0); InFlag = LoadTOCPtr.getValue(1); @@ -3517,7 +3692,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64) + if (isSVR4ABI && isPPC64 && !isELFv2ABI) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -3539,6 +3714,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); + // Direct calls in the ELFv2 ABI need the TOC register live into the call. + if (Callee.getNode() && isELFv2ABI) + Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + return CallOpc; } @@ -3559,8 +3738,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVectorImpl &InVals) const { SmallVector RVLocs; - CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); // Copy all of the result registers out of their specified physreg. 
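The indirect-call handling above (the TOC load at Callee + 8 and the X11 use, both skipped on ELFv2) follows from the 64-bit ELFv1 function-descriptor layout, where a function pointer addresses a three-doubleword descriptor rather than code. A sketch of that layout, under the assumption that the struct is only illustrative:

#include <cstdint>

// ELFv1 function descriptor referenced by an indirect callee pointer.  The
// field offsets are what make the "+ 8" TOC load and the X11 environment
// operand necessary; ELFv2 drops the descriptor, so both steps are skipped
// when isELFv2ABI.
struct ELFv1FunctionDescriptor {
  uint64_t EntryPoint;   // offset 0:  code address moved to CTR for the bctrl
  uint64_t TOCBase;      // offset 8:  value the caller places in r2
  uint64_t Environment;  // offset 16: environment pointer made available in r11
};

static_assert(sizeof(ELFv1FunctionDescriptor) == 24,
              "descriptor is three doublewords");

int main() { return 0; }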
@@ -3608,14 +3787,16 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals) const { + + bool isELFv2ABI = Subtarget.isELFv2ABI(); std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, isTailCall, RegsToPass, Ops, NodeTys, - PPCSubTarget); + Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls - if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) + if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); // When performing tail call optimization the callee pops its arguments off @@ -3626,7 +3807,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3656,7 +3838,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // same TOC), the NOP will remain unchanged. bool needsTOCRestore = false; - if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) { + if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) { if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. @@ -3681,7 +3863,12 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, if (needsTOCRestore) { SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); + SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); InFlag = Chain.getValue(1); } @@ -3717,8 +3904,8 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); - if (PPCSubTarget.isSVR4ABI()) { - if (PPCSubTarget.isPPC64()) + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, Outs, OutVals, Ins, dl, DAG, InVals); @@ -3767,11 +3954,12 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing arguments. SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Reserve space for the linkage area on the stack. - CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); + CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false), + PtrByteSize); if (isVarArg) { // Handle fixed and variable vector arguments differently. 
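The TOC-restore path above reloads r2 from a fixed slot in the caller's stack header after an indirect call; the slot position is the only ABI-dependent part. A sketch (not the PPCFrameLowering implementation) of getTOCSaveOffset's role, with the offsets taken from the linkage-area layouts described later in this patch:

#include <cstdio>

// Stack-header slot the TOC pointer is spilled to around an indirect call:
// the caller stores r2 there before the bctrl and reloads it afterwards.
static unsigned TOCSaveOffset(bool IsELFv2) {
  // ELFv1 header: [SP][CR][LR][2 x unused][TOC]  -> TOC save at r1+40.
  // ELFv2 header: [SP][CR][LR][TOC]              -> TOC save at r1+24.
  return IsELFv2 ? 24 : 40;
}

int main() {
  std::printf("ELFv1 TOC save: r1+%u\n", TOCSaveOffset(false));
  std::printf("ELFv2 TOC save: r1+%u\n", TOCSaveOffset(true));
  return 0;
}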
@@ -3808,7 +3996,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing aggregate by value arguments. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -3980,6 +4168,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { + bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -3996,16 +4186,44 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); - unsigned nAltivecParamsAtEnd = 0; - // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with at least 48 bytes, which - // is reserved space for [SP][CR][LR][3 x unused]. - // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result - // of this call. - unsigned NumBytes = - CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv, - Outs, OutVals, nAltivecParamsAtEnd); + // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes + // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage + // area is 32 bytes reserved space for [SP][CR][LR][TOC]. + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); + unsigned NumBytes = LinkageSize; + + // Add up all the space actually used. + for (unsigned i = 0; i != NumOps; ++i) { + ISD::ArgFlagsTy Flags = Outs[i].Flags; + EVT ArgVT = Outs[i].VT; + EVT OrigVT = Outs[i].ArgVT; + + /* Respect alignment of argument on the stack. */ + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + NumBytes = ((NumBytes + Align - 1) / Align) * Align; + + NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + } + + unsigned NumBytesActuallyUsed = NumBytes; + + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if its varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. + NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + + // Tail call needs the stack to be aligned. + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) + NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. @@ -4037,8 +4255,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. 
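The NumBytes loop introduced above replaces CalculateParameterAndLinkageAreaSize for the 64-bit SVR4 path: align each outgoing argument within the area, sum the slot sizes, then never go below the linkage area plus the eight GPR home slots. A simplified model follows; ParameterAreaSize and the (Size, Align) pairs are invented, and the real values come from CalculateStackSlotSize/Alignment.

#include <algorithm>
#include <cstdio>
#include <vector>

struct ArgSlot { unsigned Size; unsigned Align; };

// Align each argument, accumulate, and enforce the conservative minimum so a
// varargs callee can always home the 8 GPR argument registers.
static unsigned ParameterAreaSize(const std::vector<ArgSlot> &Args,
                                  unsigned LinkageSize, unsigned PtrByteSize) {
  unsigned NumBytes = LinkageSize;
  for (const ArgSlot &A : Args) {
    NumBytes = ((NumBytes + A.Align - 1) / A.Align) * A.Align; // respect alignment
    NumBytes += A.Size;
  }
  return std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
}

int main() {
  // ELFv2: 32-byte linkage area; arguments i64, v4i32, double.
  std::vector<ArgSlot> Args = {{8, 8}, {16, 16}, {8, 8}};
  std::printf("%u\n", ParameterAreaSize(Args, 32, 8)); // 96 = 32 + 8*8 minimum
  return 0;
}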
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true); - unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -4066,6 +4284,17 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; + EVT ArgVT = Outs[i].VT; + EVT OrigVT = Outs[i].ArgVT; + + /* Respect alignment of argument on the stack. */ + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, NumGPRs); // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. @@ -4098,24 +4327,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, if (Size == 0) continue; - unsigned BVAlign = Flags.getByValAlign(); - if (BVAlign > 8) { - if (BVAlign % PtrByteSize != 0) - llvm_unreachable( - "ByVal alignment is not a multiple of the pointer size"); - - ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; - } - // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT, - false, false, 0); + false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); ArgOffset += PtrByteSize; continue; @@ -4123,9 +4343,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, } if (GPR_idx == NumGPRs && Size < 8) { - SDValue Const = DAG.getConstant(PtrByteSize - Size, - PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(PtrByteSize - Size, + PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); @@ -4160,8 +4383,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // small aggregates, particularly for packed ones. // FIXME: It would be preferable to use the slot in the // parameter save area instead of a new local variable. - SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); @@ -4171,7 +4397,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); // Done with this argument. 
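The byval changes above make the right-justification of small aggregates a big-endian-only adjustment: little-endian targets copy the bytes to the start of the doubleword slot. A tiny sketch of the padding the memcpy applies; ByValPadding is an invented name.

#include <cstdio>

static unsigned ByValPadding(unsigned Size, bool IsLittleEndian,
                             unsigned PtrByteSize = 8) {
  if (IsLittleEndian || Size >= PtrByteSize)
    return 0;                  // LE (and full slots): data starts at the slot
  return PtrByteSize - Size;   // BE: pad on the left so data ends at slot end
}

int main() {
  std::printf("3-byte struct, BE: +%u  LE: +%u\n",
              ByValPadding(3, false), ByValPadding(3, true));
  return 0;
}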
ArgOffset += PtrByteSize; @@ -4203,8 +4429,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != NumGPRs) { - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg)); } else { LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, @@ -4213,40 +4442,70 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, ArgOffset += PtrByteSize; break; case MVT::f32: - case MVT::f64: - if (FPR_idx != NumFPRs) { + case MVT::f64: { + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. + + // Named arguments go into FPRs first, and once they overflow, the + // remaining arguments go into GPRs and then the parameter save area. + // Unnamed arguments for vararg functions always go to GPRs and + // then the parameter save area. For now, put all arguments to vararg + // routines always in both locations (FPR *and* GPR or stack slot). + bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; + + // First load the argument into the next available FPR. + if (FPR_idx != NumFPRs) RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); - if (isVarArg) { - // A single float or an aggregate containing only a single float - // must be passed right-justified in the stack doubleword, and - // in the GPR, if one is available. - SDValue StoreOff; - if (Arg.getSimpleValueType().SimpleTy == MVT::f32) { - SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); - StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); - } else - StoreOff = PtrOff; - - SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff, - MachinePointerInfo(), false, false, 0); - MemOpChains.push_back(Store); - - // Float varargs are always shadowed in available integer registers - if (GPR_idx != NumGPRs) { - SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, - MachinePointerInfo(), false, false, - false, 0); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - } - } else if (GPR_idx != NumGPRs) - // If we have any FPRs remaining, we may also have GPRs remaining. - ++GPR_idx; + // Next, load the argument into GPR or stack slot if needed. + if (!NeedGPROrStack) + ; + else if (GPR_idx != NumGPRs) { + // In the non-vararg case, this can only ever happen in the + // presence of f32 array types, since otherwise we never run + // out of FPRs before running out of GPRs. + SDValue ArgVal; + + // Double values are always passed in a single GPR. + if (Arg.getValueType() != MVT::f32) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); + + // Non-array float values are extended and passed in a GPR. + } else if (!Flags.isInConsecutiveRegs()) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); + + // If we have an array of floats, we collect every odd element + // together with its predecessor into one GPR. 
+ } else if (ArgOffset % PtrByteSize != 0) { + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); + Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + if (!isLittleEndian) + std::swap(Lo, Hi); + ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); + + // The final element, if even, goes into the first half of a GPR. + } else if (Flags.isInConsecutiveRegsLast()) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); + if (!isLittleEndian) + ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + + // Non-final even elements are skipped; they will be handled + // together the with subsequent argument on the next go-around. + } else + ArgVal = SDValue(); + + if (ArgVal.getNode()) + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal)); } else { // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. - if (Arg.getValueType() == MVT::f32) { + if (Arg.getValueType() == MVT::f32 && + !isLittleEndian && !Flags.isInConsecutiveRegs()) { SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } @@ -4255,29 +4514,32 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, true, isTailCall, false, MemOpChains, TailCallArguments, dl); } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgOffset += (Arg.getValueType() == MVT::f32 && + Flags.isInConsecutiveRegs()) ? 4 : 8; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; + } case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + + // For a varargs call, named arguments go into VRs or on the stack as + // usual; unnamed arguments always go to the stack or the corresponding + // GPRs when within range. For now, we always put the value in both + // locations (or even all three). if (isVarArg) { - // These go aligned on the stack, or in the corresponding R registers - // when within range. The Darwin PPC ABI doc claims they also go in - // V registers; in fact gcc does this only for arguments that are - // prototyped, not for those that match the ... We do it for all - // arguments, seems to work. - while (ArgOffset % 16 !=0) { - ArgOffset += PtrByteSize; - if (GPR_idx != NumGPRs) - GPR_idx++; - } // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. - PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, - DAG.getConstant(ArgOffset, PtrVT)); SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(), false, false, 0); MemOpChains.push_back(Store); @@ -4308,10 +4570,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, break; } - // Non-varargs Altivec params generally go in registers, but have - // stack space allocated at the end. + // Non-varargs Altivec params go into VRs or on the stack. if (VR_idx != NumVRs) { - // Doesn't have GPR space allocated. 
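The BUILD_PAIR logic above pairs two adjacent f32 elements of a float array into one GPR, with the element at the lower offset landing in the high half on big-endian targets and in the low half on little-endian ones. A host-side sketch of the packing; PackFloatPair is an invented helper.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>

static uint64_t PackFloatPair(float First, float Second, bool IsLittleEndian) {
  uint32_t Lo, Hi;
  std::memcpy(&Lo, &First, 4);   // element at the lower argument offset
  std::memcpy(&Hi, &Second, 4);  // element at the higher argument offset
  if (!IsLittleEndian)
    std::swap(Lo, Hi);           // BE: first element goes into the high half
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  std::printf("BE: %016llx\n",
              (unsigned long long)PackFloatPair(1.0f, 2.0f, false));
  std::printf("LE: %016llx\n",
              (unsigned long long)PackFloatPair(1.0f, 2.0f, true));
  return 0;
}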
unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || Arg.getSimpleValueType() == MVT::v2i64) ? VSRH[VR_idx] : VR[VR_idx]; @@ -4322,12 +4582,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); - ArgOffset += 16; } + ArgOffset += 16; break; } } + assert(NumBytesActuallyUsed == ArgOffset); + (void)NumBytesActuallyUsed; + if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); @@ -4336,19 +4599,20 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // pointers in the 64-bit SVR4 ABI. if (!isTailCall && !dyn_cast(Callee) && - !dyn_cast(Callee) && - !isBLACompatibleAddress(Callee, DAG)) { + !dyn_cast(Callee)) { // Load r2 into a virtual register and store it to the TOC save area. SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. - SDValue PtrOff = DAG.getIntPtrConstant(40); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); + SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(), false, false, 0); - // R12 must contain the address of an indirect callee. This does not - // mean the MTCTR instruction must use R12; it's easier to model this - // as an extra parameter, so do that. - RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); + // In the ELFv2 ABI, R12 must contain the address of an indirect callee. + // This does not mean the MTCTR instruction must use R12; it's easier + // to model this as an extra parameter, so do that. + if (isELFv2ABI) + RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain @@ -4396,15 +4660,56 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); - unsigned nAltivecParamsAtEnd = 0; - // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. - unsigned NumBytes = - CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv, - Outs, OutVals, - nAltivecParamsAtEnd); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); + unsigned NumBytes = LinkageSize; + + // Add up all the space actually used. + // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually + // they all go in registers, but we must reserve stack space for them for + // possible use by the caller. In varargs or 64-bit calls, parameters are + // assigned stack space in order, with padding so Altivec parameters are + // 16-byte aligned. + unsigned nAltivecParamsAtEnd = 0; + for (unsigned i = 0; i != NumOps; ++i) { + ISD::ArgFlagsTy Flags = Outs[i].Flags; + EVT ArgVT = Outs[i].VT; + // Varargs Altivec parameters are padded to a 16 byte boundary. + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { + if (!isVarArg && !isPPC64) { + // Non-varargs Altivec parameters go after all the non-Altivec + // parameters; handle those later so we know how much padding we need. 
+ nAltivecParamsAtEnd++; + continue; + } + // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. + NumBytes = ((NumBytes+15)/16)*16; + } + NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + } + + // Allow for Altivec parameters at the end, if needed. + if (nAltivecParamsAtEnd) { + NumBytes = ((NumBytes+15)/16)*16; + NumBytes += 16*nAltivecParamsAtEnd; + } + + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if its varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + + // Tail call needs the stack to be aligned. + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) + NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. @@ -4440,7 +4745,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR_32[] = { // 32-bit registers. @@ -4498,7 +4803,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT, - false, false, 0); + false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); @@ -4727,8 +5032,7 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), - RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_PPC); } @@ -4740,8 +5044,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain, SDLoc dl, SelectionDAG &DAG) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); SDValue Flag; @@ -4817,8 +5121,8 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. 
The users of this index will be @@ -4841,8 +5145,8 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. The users of this index will be @@ -5062,12 +5366,12 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ : - (PPCSubTarget.hasFPCVT() ? PPCISD::FCTIWUZ : + (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); break; case MVT::i64: - assert((Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()) && + assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, @@ -5076,8 +5380,8 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, } // Convert the FP value to an int value through memory. - bool i32Stack = Op.getValueType() == MVT::i32 && PPCSubTarget.hasSTFIWX() && - (Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()); + bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && + (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast(FIPtr)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI); @@ -5119,17 +5423,17 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, DAG.getConstantFP(1.0, Op.getValueType()), DAG.getConstantFP(0.0, Op.getValueType())); - assert((Op.getOpcode() == ISD::SINT_TO_FP || PPCSubTarget.hasFPCVT()) && + assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. - unsigned FCFOp = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); - MVT FCFTy = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; if (Op.getOperand(0).getValueType() == MVT::i64) { @@ -5145,7 +5449,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // However, if -enable-unsafe-fp-math is in effect, accept double // rounding to avoid the extra overhead. if (Op.getValueType() == MVT::f32 && - !PPCSubTarget.hasFPCVT() && + !Subtarget.hasFPCVT() && !DAG.getTarget().Options.UnsafeFPMath) { // Twiddle input to make sure the low 11 bits are zero. 
(If this @@ -5183,7 +5487,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); - if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; @@ -5200,7 +5504,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue Ld; - if (PPCSubTarget.hasLFIWAX() || PPCSubTarget.hasFPCVT()) { + if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -5219,7 +5523,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); } else { - assert(PPCSubTarget.isPPC64() && + assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); @@ -5241,7 +5545,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // FCFID it and return it. SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); - if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; } @@ -5530,10 +5834,14 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // we convert to a pseudo that will be expanded later into one of // the above forms. SDValue Elt = DAG.getConstant(SextVal, MVT::i32); - EVT VT = Op.getValueType(); - int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4); - SDValue EltSize = DAG.getConstant(Size, MVT::i32); - return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + EVT VT = (SplatSize == 1 ? MVT::v16i8 : + (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); + SDValue EltSize = DAG.getConstant(SplatSize, MVT::i32); + SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + if (VT == Op.getValueType()) + return RetVal; + else + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is @@ -5552,6 +5860,22 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } + // The remaining cases assume either big endian element order or + // a splat-size that equates to the element size of the vector + // to be built. An example that doesn't work for little endian is + // {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits + // and a vector element size of 16 bits. The code below will + // produce the vector in big endian element order, which for little + // endian is {-1, 0, -1, 0, -1, 0, -1, 0}. + + // For now, just avoid these optimizations in that case. + // FIXME: Develop correct optimizations for LE with mismatched + // splat and element sizes. + + if (Subtarget.isLittleEndian() && + SplatSize != Op.getValueType().getVectorElementType().getSizeInBits()) + return SDValue(); + // Check to see if this is a wide variety of vsplti*, binop self cases. 
static const signed char SplatCsts[] = { -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, @@ -5720,6 +6044,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V2 = Op.getOperand(1); ShuffleVectorSDNode *SVOp = cast(Op); EVT VT = Op.getValueType(); + bool isLittleEndian = Subtarget.isLittleEndian(); // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be @@ -5728,15 +6053,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || - PPC::isVPKUWUMShuffleMask(SVOp, true) || - PPC::isVPKUHUMShuffleMask(SVOp, true) || - PPC::isVSLDOIShuffleMask(SVOp, true) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, true) || - PPC::isVMRGLShuffleMask(SVOp, 2, true) || - PPC::isVMRGLShuffleMask(SVOp, 4, true) || - PPC::isVMRGHShuffleMask(SVOp, 1, true) || - PPC::isVMRGHShuffleMask(SVOp, 2, true) || - PPC::isVMRGHShuffleMask(SVOp, 4, true)) { + PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG)) { return Op; } } @@ -5744,15 +6069,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. - if (PPC::isVPKUWUMShuffleMask(SVOp, false) || - PPC::isVPKUHUMShuffleMask(SVOp, false) || - PPC::isVSLDOIShuffleMask(SVOp, false) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, false) || - PPC::isVMRGLShuffleMask(SVOp, 2, false) || - PPC::isVMRGLShuffleMask(SVOp, 4, false) || - PPC::isVMRGHShuffleMask(SVOp, 1, false) || - PPC::isVMRGHShuffleMask(SVOp, 2, false) || - PPC::isVMRGHShuffleMask(SVOp, 4, false)) + unsigned int ShuffleKind = isLittleEndian ? 2 : 0; + if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG)) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our @@ -5786,7 +6112,9 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this shuffle can be expressed as a shuffle of 4-byte elements, use the // perfect shuffle vector to determine if it is cost effective to do this as // discrete instructions, or whether we should use a vperm. - if (isFourElementShuffle) { + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle && !isLittleEndian) { // Compute the index in the perfect shuffle table. 
unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; @@ -5815,6 +6143,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. Convert now. + + // For little endian, the order of the input vectors is reversed, and + // the permutation mask is complemented with respect to 31. This is + // necessary to produce proper semantics with the big-endian-biased vperm + // instruction. EVT EltVT = V1.getValueType().getVectorElementType(); unsigned BytesPerElement = EltVT.getSizeInBits()/8; @@ -5823,13 +6156,22 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; for (unsigned j = 0; j != BytesPerElement; ++j) - ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, - MVT::i32)); + if (isLittleEndian) + ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j), + MVT::i32)); + else + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, + MVT::i32)); } SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, ResultMask); - return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); + if (isLittleEndian) + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V2, V1, VPermMask); + else + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V1, V2, VPermMask); } /// getAltivecCompareInfo - Given an intrinsic, return false if it is not an @@ -6022,6 +6364,7 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { LHS, RHS, Zero, DAG, dl); } else if (Op.getValueType() == MVT::v16i8) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); + bool isLittleEndian = Subtarget.isLittleEndian(); // Multiply the even 8-bit parts, producing 16-bit sums. SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, @@ -6033,13 +6376,24 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { LHS, RHS, DAG, dl, MVT::v8i16); OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); - // Merge the results together. + // Merge the results together. Because vmuleub and vmuloub are + // instructions with a big-endian bias, we must reverse the + // element numbering and reverse the meaning of "odd" and "even" + // when generating little endian code. 
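The little-endian adjustments above (complementing each vperm byte index against 31 and swapping V1/V2) and the even/odd reversal below for the vmuleub/vmuloub merge both compensate for the same big-endian bias in the underlying instructions. A sketch of the byte-mask construction; BuildVPermMask is an invented name, and the example shuffle is arbitrary.

#include <cstdio>
#include <vector>

// Build the byte-level permute control for an element-level shuffle mask.
// vperm indexes the 32 bytes of its two sources with big-endian numbering,
// so for LE the inputs are swapped and every index is complemented.
static std::vector<unsigned> BuildVPermMask(const std::vector<int> &ElemMask,
                                            unsigned BytesPerElement,
                                            bool IsLittleEndian) {
  std::vector<unsigned> Bytes;
  for (int Elt : ElemMask) {
    unsigned SrcElt = Elt < 0 ? 0 : unsigned(Elt); // undef lanes pick element 0
    for (unsigned j = 0; j != BytesPerElement; ++j) {
      unsigned Idx = SrcElt * BytesPerElement + j;
      Bytes.push_back(IsLittleEndian ? 31 - Idx : Idx);
    }
  }
  return Bytes;
}

int main() {
  // v4i32 shuffle <1, 4, 2, 7>; elements 4..7 come from the second source.
  std::vector<int> Mask = {1, 4, 2, 7};
  const bool Cases[2] = {false, true};
  for (bool LE : Cases) {
    std::printf(LE ? "LE:" : "BE:");
    for (unsigned B : BuildVPermMask(Mask, 4, LE))
      std::printf(" %2u", B);
    std::printf("\n");
  }
  return 0;
}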
int Ops[16]; for (unsigned i = 0; i != 8; ++i) { - Ops[i*2 ] = 2*i+1; - Ops[i*2+1] = 2*i+1+16; + if (isLittleEndian) { + Ops[i*2 ] = 2*i; + Ops[i*2+1] = 2*i+16; + } else { + Ops[i*2 ] = 2*i+1; + Ops[i*2+1] = 2*i+1+16; + } } - return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); + if (isLittleEndian) + return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); + else + return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); } else { llvm_unreachable("Unknown mul to lower!"); } @@ -6059,17 +6413,17 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::VASTART: - return LowerVASTART(Op, DAG, PPCSubTarget); + return LowerVASTART(Op, DAG, Subtarget); case ISD::VAARG: - return LowerVAARG(Op, DAG, PPCSubTarget); + return LowerVAARG(Op, DAG, Subtarget); case ISD::VACOPY: - return LowerVACOPY(Op, DAG, PPCSubTarget); + return LowerVACOPY(Op, DAG, Subtarget); - case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, PPCSubTarget); + case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: - return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget); + return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); @@ -6139,7 +6493,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, EVT VT = N->getValueType(0); if (VT == MVT::i64) { - SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, PPCSubTarget); + SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget); Results.push_back(NewNode); Results.push_back(NewNode.getValue(1)); @@ -6183,7 +6537,8 @@ MachineBasicBlock * PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, bool is64bit, unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); @@ -6245,12 +6600,13 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, bool is8bit, // operation unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); unsigned ZeroReg = is64bit ? 
PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -6373,7 +6729,8 @@ llvm::MachineBasicBlock* PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6445,7 +6802,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned LabelReg = MRI.createVirtualRegister(PtrRC); unsigned BufReg = MI->getOperand(1).getReg(); - if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) { + if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) .addImm(TOCOffset) @@ -6458,12 +6815,12 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned BaseReg; if (MF->getFunction()->getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::Naked)) - BaseReg = PPCSubTarget.isPPC64() ? PPC::X1 : PPC::R1; + BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; else - BaseReg = PPCSubTarget.isPPC64() ? PPC::BP8 : PPC::BP; + BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; MIB = BuildMI(*thisMBB, MI, DL, - TII->get(PPCSubTarget.isPPC64() ? PPC::STD : PPC::STW)) + TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) .addReg(BaseReg) .addImm(BPOffset) .addReg(BufReg); @@ -6472,7 +6829,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); const PPCRegisterInfo *TRI = - static_cast(getTargetMachine().getRegisterInfo()); + getTargetMachine().getSubtarget().getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); @@ -6487,10 +6844,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // mainMBB: // mainDstReg = 0 MIB = BuildMI(mainMBB, DL, - TII->get(PPCSubTarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); + TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); // Store IP - if (PPCSubTarget.isPPC64()) { + if (Subtarget.isPPC64()) { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) .addReg(LabelReg) .addImm(LabelOffset) @@ -6521,7 +6878,8 @@ MachineBasicBlock * PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6540,7 +6898,10 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; - unsigned BP = (PVT == MVT::i64) ? PPC::X30 : PPC::R30; + unsigned BP = (PVT == MVT::i64) ? PPC::X30 : + (Subtarget.isSVR4ABI() && + MF->getTarget().getRelocationModel() == Reloc::PIC_ ? 
+ PPC::R29 : PPC::R30); MachineInstrBuilder MIB; @@ -6602,7 +6963,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MIB.setMemRefs(MMOBegin, MMOEnd); // Reload TOC - if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) { + if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) .addImm(TOCOffset) .addReg(BufReg); @@ -6630,7 +6991,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return emitEHSjLjLongJmp(MI, BB); } - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); // To "insert" these instructions we actually have to insert their // control-flow patterns. @@ -6640,7 +7002,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); - if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || + if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || MI->getOpcode() == PPC::SELECT_CC_I8 || MI->getOpcode() == PPC::SELECT_I4 || MI->getOpcode() == PPC::SELECT_I8)) { @@ -6653,7 +7015,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, Cond.push_back(MI->getOperand(1)); DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), Cond, MI->getOperand(2).getReg(), MI->getOperand(3).getReg()); @@ -6760,13 +7123,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8); + BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); @@ -6857,7 +7220,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // We must use 64-bit registers for addresses when targeting 64-bit, // since we're actually doing arithmetic on them. Other registers // can be 32-bit. 
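The atomic changes above switch the ATOMIC_LOAD_NAND expansions from ANDC to NAND, because the two operations only agree for special inputs. A minimal illustration of the difference:

#include <cstdint>
#include <cstdio>

// 'atomicrmw nand' stores ~(old & val); andc computes old & ~val.
static uint32_t Nand(uint32_t A, uint32_t B) { return ~(A & B); }
static uint32_t Andc(uint32_t A, uint32_t B) { return A & ~B; }

int main() {
  uint32_t Old = 0xF0F0F0F0u, Val = 0xFF00FF00u;
  std::printf("nand: %08x\n", Nand(Old, Val)); // 0x0fff0fff
  std::printf("andc: %08x\n", Andc(Old, Val)); // 0x00f000f0
  return 0;
}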
- bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; unsigned dest = MI->getOperand(0).getReg(); @@ -7065,10 +7428,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, EVT VT = Op.getValueType(); - if ((VT == MVT::f32 && PPCSubTarget.hasFRES()) || - (VT == MVT::f64 && PPCSubTarget.hasFRE()) || - (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) || - (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) { + if ((VT == MVT::f32 && Subtarget.hasFRES()) || + (VT == MVT::f64 && Subtarget.hasFRE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal, we need to find the zero of the function: @@ -7081,7 +7444,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, // correct after every iteration. The minimum architected relative // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. - int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) ++Iterations; @@ -7128,10 +7491,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, EVT VT = Op.getValueType(); - if ((VT == MVT::f32 && PPCSubTarget.hasFRSQRTES()) || - (VT == MVT::f64 && PPCSubTarget.hasFRSQRTE()) || - (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) || - (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) { + if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || + (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal sqrt, we need to find the zero of the function: @@ -7144,7 +7507,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, // correct after every iteration. The minimum architected relative // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. - int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) ++Iterations; @@ -7193,16 +7556,12 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, return SDValue(); } -// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does -// not enforce equality of the chain operands. -static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, +static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { - EVT VT = LS->getMemoryVT(); if (VT.getSizeInBits() / 8 != Bytes) return false; - SDValue Loc = LS->getBasePtr(); SDValue BaseLoc = Base->getBasePtr(); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) @@ -7233,11 +7592,69 @@ static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, return false; } +// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does +// not enforce equality of the chain operands. 
+static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, + unsigned Bytes, int Dist, + SelectionDAG &DAG) { + if (LSBaseSDNode *LS = dyn_cast(N)) { + EVT VT = LS->getMemoryVT(); + SDValue Loc = LS->getBasePtr(); + return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); + } + + if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + EVT VT; + switch (cast(N->getOperand(1))->getZExtValue()) { + default: return false; + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + VT = MVT::v4i32; + break; + case Intrinsic::ppc_altivec_lvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_lvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_lvewx: + VT = MVT::i32; + break; + } + + return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); + } + + if (N->getOpcode() == ISD::INTRINSIC_VOID) { + EVT VT; + switch (cast(N->getOperand(1))->getZExtValue()) { + default: return false; + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + VT = MVT::v4i32; + break; + case Intrinsic::ppc_altivec_stvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_stvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_stvewx: + VT = MVT::i32; + break; + } + + return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG); + } + + return false; +} + // Return true is there is a nearyby consecutive load to the one provided // (regardless of alignment). We search up and down the chain, looking though -// token factors and other loads (but nothing else). As a result, a true -// results indicates that it is safe to create a new consecutive load adjacent -// to the load provided. +// token factors and other loads (but nothing else). As a result, a true result +// indicates that it is safe to create a new consecutive load adjacent to the +// load provided. 
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); @@ -7254,17 +7671,16 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { if (!Visited.insert(ChainNext)) continue; - if (LoadSDNode *ChainLD = dyn_cast(ChainNext)) { + if (MemSDNode *ChainLD = dyn_cast(ChainNext)) { if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; if (!Visited.count(ChainLD->getChain().getNode())) Queue.push_back(ChainLD->getChain().getNode()); } else if (ChainNext->getOpcode() == ISD::TokenFactor) { - for (SDNode::op_iterator O = ChainNext->op_begin(), - OE = ChainNext->op_end(); O != OE; ++O) - if (!Visited.count(O->getNode())) - Queue.push_back(O->getNode()); + for (const SDUse &O : ChainNext->ops()) + if (!Visited.count(O.getNode())) + Queue.push_back(O.getNode()); } else LoadRoots.insert(ChainNext); } @@ -7286,14 +7702,14 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { if (!Visited.insert(LoadRoot)) continue; - if (LoadSDNode *ChainLD = dyn_cast(LoadRoot)) + if (MemSDNode *ChainLD = dyn_cast(LoadRoot)) if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; for (SDNode::use_iterator UI = LoadRoot->use_begin(), UE = LoadRoot->use_end(); UI != UE; ++UI) - if (((isa(*UI) && - cast(*UI)->getChain().getNode() == LoadRoot) || + if (((isa(*UI) && + cast(*UI)->getChain().getNode() == LoadRoot) || UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) Queue.push_back(*UI); } @@ -7307,7 +7723,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); - assert(PPCSubTarget.useCRBits() && + assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); // If we're tracking CR bits, we need to be careful that we don't have: // trunc(binary-ops(zext(x), zext(y))) @@ -7605,9 +8021,9 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, return SDValue(); if (!((N->getOperand(0).getValueType() == MVT::i1 && - PPCSubTarget.useCRBits()) || + Subtarget.useCRBits()) || (N->getOperand(0).getValueType() == MVT::i32 && - PPCSubTarget.isPPC64()))) + Subtarget.isPPC64()))) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND && @@ -7925,8 +8341,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DCI.AddToWorklist(RV.getNode()); RV = DAGCombineFastRecip(RV, DCI); if (RV.getNode()) { - // Unfortunately, RV is now NaN if the input was exactly 0. Select out - // this case and force the answer to 0. + // Unfortunately, RV is now NaN if the input was exactly 0. Select out + // this case and force the answer to 0. EVT VT = RV.getValueType(); @@ -8046,6 +8462,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // This is a type-legal unaligned Altivec load. SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); + bool isLittleEndian = Subtarget.isLittleEndian(); // This implements the loading of unaligned vectors as described in // the venerable Apple Velocity Engine overview. Specifically: @@ -8053,37 +8470,48 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html // // The general idea is to expand a sequence of one or more unaligned - // loads into a alignment-based permutation-control instruction (lvsl), - // a series of regular vector loads (which always truncate their - // input address to an aligned address), and a series of permutations. 
- // The results of these permutations are the requested loaded values. - // The trick is that the last "extra" load is not taken from the address - // you might suspect (sizeof(vector) bytes after the last requested - // load), but rather sizeof(vector) - 1 bytes after the last - // requested vector. The point of this is to avoid a page fault if the - // base address happened to be aligned. This works because if the base - // address is aligned, then adding less than a full vector length will - // cause the last vector in the sequence to be (re)loaded. Otherwise, - // the next vector will be fetched as you might suspect was necessary. + // loads into an alignment-based permutation-control instruction (lvsl + // or lvsr), a series of regular vector loads (which always truncate + // their input address to an aligned address), and a series of + // permutations. The results of these permutations are the requested + // loaded values. The trick is that the last "extra" load is not taken + // from the address you might suspect (sizeof(vector) bytes after the + // last requested load), but rather sizeof(vector) - 1 bytes after the + // last requested vector. The point of this is to avoid a page fault if + // the base address happened to be aligned. This works because if the + // base address is aligned, then adding less than a full vector length + // will cause the last vector in the sequence to be (re)loaded. + // Otherwise, the next vector will be fetched as you might suspect was + // necessary. // We might be able to reuse the permutation generation from // a different base address offset from this one by an aligned amount. // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this // optimization later. - SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr, - DAG, dl, MVT::v16i8); - - // Refine the alignment of the original load (a "new" load created here - // which was identical to the first except for the alignment would be - // merged with the existing node regardless). + Intrinsic::ID Intr = (isLittleEndian ? + Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8); + + // Create the new MMO for the new base load. It is like the original MMO, + // but represents an area in memory almost twice the vector size centered + // on the original address. If the address is unaligned, we might start + // reading up to (sizeof(vector)-1) bytes below the address of the + // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(LD->getPointerInfo(), - LD->getMemOperand()->getFlags(), - LD->getMemoryVT().getStoreSize(), - ABIAlignment); - LD->refineAlignment(MMO); - SDValue BaseLoad = SDValue(LD, 0); + MachineMemOperand *BaseMMO = + MF.getMachineMemOperand(LD->getMemOperand(), + -LD->getMemoryVT().getStoreSize()+1, + 2*LD->getMemoryVT().getStoreSize()-1); + + // Create the new base load. 
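For context, the expansion described in the comment above is the classic big-endian idiom from the Apple Velocity Engine notes it cites. In source form it is roughly the sketch below (illustrative only, not code from this patch; it assumes a compiler with AltiVec support and <altivec.h>, and the helper name load_unaligned is mine):

#include <altivec.h>

// Load 16 bytes from a possibly misaligned pointer (big-endian form).
// vec_ld silently truncates its address to 16-byte alignment, so the two
// loads fetch the aligned blocks covering the first and last requested byte;
// the offset of 15 (not 16) avoids touching a new page when p is aligned.
static vector unsigned char load_unaligned(const unsigned char *p) {
  vector unsigned char lo   = vec_ld(0, p);    // block containing p
  vector unsigned char hi   = vec_ld(15, p);   // block containing p + 15
  vector unsigned char perm = vec_lvsl(0, p);  // permute control from p & 0xF
  return vec_perm(lo, hi, perm);               // splice out the 16 wanted bytes
}

On little-endian targets the patch instead generates lvsr for the permute control and swaps the two vperm inputs, which is exactly the change being made in this hunk.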
+ SDValue LDXIntID = DAG.getTargetConstant(Intrinsic::ppc_altivec_lvx, + getPointerTy()); + SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; + SDValue BaseLoad = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, + DAG.getVTList(MVT::v4i32, MVT::Other), + BaseLoadOps, MVT::v4i32, BaseMMO); // Note that the value of IncOffset (which is provided to the next // load's pointer info offset value, and thus used to calculate the @@ -8105,65 +8533,47 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDValue Increment = DAG.getConstant(IncValue, getPointerTy()); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + MachineMemOperand *ExtraMMO = + MF.getMachineMemOperand(LD->getMemOperand(), + 1, 2*LD->getMemoryVT().getStoreSize()-1); + SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue ExtraLoad = - DAG.getLoad(VT, dl, Chain, Ptr, - LD->getPointerInfo().getWithOffset(IncOffset), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), ABIAlignment); + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, + DAG.getVTList(MVT::v4i32, MVT::Other), + ExtraLoadOps, MVT::v4i32, ExtraMMO); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, BaseLoad.getValue(1), ExtraLoad.getValue(1)); - if (BaseLoad.getValueType() != MVT::v4i32) - BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad); - - if (ExtraLoad.getValueType() != MVT::v4i32) - ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad); - - SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, - BaseLoad, ExtraLoad, PermCntl, DAG, dl); + // Because vperm has a big-endian bias, we must reverse the order + // of the input vectors and complement the permute control vector + // when generating little endian code. We have already handled the + // latter by using lvsr instead of lvsl, so just reverse BaseLoad + // and ExtraLoad here. + SDValue Perm; + if (isLittleEndian) + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + ExtraLoad, BaseLoad, PermCntl, DAG, dl); + else + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != MVT::v4i32) Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm); - // Now we need to be really careful about how we update the users of the - // original load. We cannot just call DCI.CombineTo (or - // DAG.ReplaceAllUsesWith for that matter), because the load still has - // uses created here (the permutation for example) that need to stay. - SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); - while (UI != UE) { - SDUse &Use = UI.getUse(); - SDNode *User = *UI; - // Note: BaseLoad is checked here because it might not be N, but a - // bitcast of N. - if (User == Perm.getNode() || User == BaseLoad.getNode() || - User == TF.getNode() || Use.getResNo() > 1) { - ++UI; - continue; - } - - SDValue To = Use.getResNo() ? TF : Perm; - ++UI; - - SmallVector Ops; - for (SDNode::op_iterator O = User->op_begin(), - OE = User->op_end(); O != OE; ++O) { - if (*O == Use) - Ops.push_back(To); - else - Ops.push_back(*O); - } - - DAG.UpdateNodeOperands(User, Ops); - } - + // The output of the permutation is our loaded result, the TokenFactor is + // our new chain. + DCI.CombineTo(N, Perm, TF); return SDValue(N, 0); } } break; - case ISD::INTRINSIC_WO_CHAIN: - if (cast(N->getOperand(0))->getZExtValue() == - Intrinsic::ppc_altivec_lvsl && + case ISD::INTRINSIC_WO_CHAIN: { + bool isLittleEndian = Subtarget.isLittleEndian(); + Intrinsic::ID Intr = (isLittleEndian ? 
+ Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + if (cast(N->getOperand(0))->getZExtValue() == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); @@ -8175,8 +8585,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(UI->getOperand(0))->getZExtValue() == - Intrinsic::ppc_altivec_lvsl) { - // We've found another LVSL, and this address if an aligned + Intr) { + // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. @@ -8185,6 +8595,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } } + } break; case ISD::BSWAP: @@ -8532,11 +8943,11 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'b': // R1-R31 - if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); case 'r': // R0-R31 - if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RCRegClass); return std::make_pair(0U, &PPC::GPRCRegClass); case 'f': @@ -8568,9 +8979,10 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // register. // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use // the AsmName field from *RegisterInfo.td, then this would not be necessary. - if (R.first && VT == MVT::i64 && PPCSubTarget.isPPC64() && + if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); return std::make_pair(TRI->getMatchingSuperReg(R.first, PPC::sub_32, &PPC::G8RCRegClass), &PPC::G8RCRegClass); @@ -8702,8 +9114,8 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, // the stack. PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setLRStoreRequired(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); @@ -8757,8 +9169,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // this table could be generated automatically from RegInfo. 
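A usage note on the 'b' versus 'r' inline-asm constraints handled above: 'b' requests a GPR other than R0, because an R0 in the base-register slot of a D-form memory access is read as the literal value zero. A hypothetical GNU inline-asm example for 64-bit PowerPC (not from this file; the helper name is mine):

#include <cstdint>

// Load a doubleword through inline asm.  The address operand uses the "b"
// constraint so the register allocator never picks r0 for the base of 0(%1).
static uint64_t load_dword(const uint64_t *p) {
  uint64_t v;
  asm("ld %0, 0(%1)" : "=r"(v) : "b"(p) : "memory");
  return v;
}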
unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT) const { - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) @@ -8783,6 +9195,82 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { return false; } +bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + + switch (Intrinsic) { + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + case Intrinsic::ppc_altivec_lvebx: + case Intrinsic::ppc_altivec_lvehx: + case Intrinsic::ppc_altivec_lvewx: { + EVT VT; + switch (Intrinsic) { + case Intrinsic::ppc_altivec_lvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_lvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_lvewx: + VT = MVT::i32; + break; + default: + VT = MVT::v4i32; + break; + } + + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(0); + Info.offset = -VT.getStoreSize()+1; + Info.size = 2*VT.getStoreSize()-1; + Info.align = 1; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + case Intrinsic::ppc_altivec_stvebx: + case Intrinsic::ppc_altivec_stvehx: + case Intrinsic::ppc_altivec_stvewx: { + EVT VT; + switch (Intrinsic) { + case Intrinsic::ppc_altivec_stvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_stvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_stvewx: + VT = MVT::i32; + break; + default: + VT = MVT::v4i32; + break; + } + + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(1); + Info.offset = -VT.getStoreSize()+1; + Info.size = 2*VT.getStoreSize()-1; + Info.align = 1; + Info.vol = false; + Info.readMem = false; + Info.writeMem = true; + return true; + } + default: + break; + } + + return false; +} + /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. 
If DstAlign is zero that means it's safe to destination @@ -8799,7 +9287,7 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { - if (this->PPCSubTarget.isPPC64()) { + if (Subtarget.isPPC64()) { return MVT::i64; } else { return MVT::i32; @@ -8842,9 +9330,10 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } -bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned, - bool *Fast) const { +bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { if (DisablePPCUnaligned) return false; @@ -8858,7 +9347,7 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, return false; if (VT.getSimpleVT().isVector()) { - if (PPCSubTarget.hasVSX()) { + if (Subtarget.hasVSX()) { if (VT != MVT::v2f64 && VT != MVT::v2i64) return false; } else { @@ -8902,7 +9391,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( } Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { - if (DisableILPPref || PPCSubTarget.enableMachineScheduler()) + if (DisableILPPref || Subtarget.enableMachineScheduler()) return TargetLowering::getSchedulingPreference(N); return Sched::ILP;
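Finally, a worked sanity check (editorial, not part of the patch) for the values the new getTgtMemIntrinsic hook above reports for the 16-byte AltiVec intrinsics: lvx and stvx ignore the low four address bits, so over all possible alignments of the pointer the accessed bytes fall in a 31-byte window starting 15 bytes below it, which is exactly offset = -VT.getStoreSize()+1 and size = 2*VT.getStoreSize()-1 with align = 1. The helper name below is mine.

#include <cassert>
#include <cstdint>

// The base of the 16-byte block lvx actually reads for an effective address p:
// the hardware simply drops the low four bits.
static uint64_t lvxBase(uint64_t p) { return p & ~uint64_t(15); }

int main() {
  for (uint64_t p = 0x1000; p < 0x1010; ++p) { // one address of each alignment
    uint64_t lo = lvxBase(p), hi = lo + 15;    // bytes the hardware touches
    assert(lo >= p - 15 && hi <= p + 15);      // within the reported 31-byte window
  }
  return 0;
}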