X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FPowerPC%2FPPCISelLowering.cpp;h=36e1e1334e5b1a83925aeede13ef3580b7870cf2;hb=aa5b9c0f6f3a99f955fe0ded13d61d7eb4e1a0b5;hp=db2003fbe03ec69a9dbcc881fc7aa3c01f31a2f6;hpb=b053ddc90906cdf880414dd9bfcfa22a9685633a;p=oota-llvm.git diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index db2003fbe03..36e1e1334e5 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -453,6 +453,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); @@ -526,11 +528,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Altivec does not contain unordered floating-point compare instructions setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand); - setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); @@ -561,11 +558,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETUGT, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETUGE, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETULT, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETULE, MVT::v2f64, Expand); - setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); @@ -626,6 +618,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // Altivec instructions set fields to all zeros or all ones. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + if (!isPPC64) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + if (isPPC64) { setStackPointerRegisterToSaveRestore(PPC::X1); setExceptionPointerRegister(PPC::X3); @@ -853,14 +852,27 @@ static bool isConstantOrUndef(int Op, int Val) { /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. -bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operantion with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 
0 : 1; - if (!isUnary) { + bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; for (unsigned i = 0; i != 16; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+j)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) return false; - } else { + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; ++i) + if (!isConstantOrUndef(N->getMaskElt(i), i*2)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 1; for (unsigned i = 0; i != 8; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) @@ -871,27 +883,34 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. -bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operantion with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - unsigned j, k; - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - j = 0; - k = 1; - } else { - j = 2; - k = 3; - } - if (!isUnary) { + bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; for (unsigned i = 0; i != 16; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+k)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) return false; - } else { + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; i += 2) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 2; for (unsigned i = 0; i != 8; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+k) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || - !isConstantOrUndef(N->getMaskElt(i+9), i*2+k)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) return false; } return true; @@ -919,38 +938,63 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). 
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - if (!isUnary) + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 0, 16); - return isVMerge(N, UnitSize, 0, 0); + else + return false; } else { - if (!isUnary) + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 8, 24); - return isVMerge(N, UnitSize, 8, 8); + else + return false; } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - if (!isUnary) + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 8, 24); - return isVMerge(N, UnitSize, 8, 8); + else + return false; } else { - if (!isUnary) + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 0, 16); - return isVMerge(N, UnitSize, 0, 0); + else + return false; } } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) { +/// The ShuffleKind distinguishes between big-endian operations with two +/// different inputs (0), either-endian operations with two identical inputs +/// (1), and little-endian operations with two different inputs (2). For the +/// latter, the input operands are swapped (see PPCInstrAltivec.td). +int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; @@ -968,38 +1012,26 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) { unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { - - ShiftAmt += i; - - if (!isUnary) { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i)) - return -1; - } else { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15)) - return -1; - } - - } else { // Big Endian + ShiftAmt -= i; + bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()-> + isLittleEndian(); + + if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { + // Check the rest of the elements to see if they are consecutive. 
+ for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) + return -1; + } else if (ShuffleKind == 1) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) + return -1; + } else + return -1; - ShiftAmt -= i; + if (ShuffleKind == 2 && isLE) + ShiftAmt = 16 - ShiftAmt; - if (!isUnary) { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) - return -1; - } else { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) - return -1; - } - } return ShiftAmt; } @@ -1055,7 +1087,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); - if (DAG.getTarget().getDataLayout()->isLittleEndian()) + if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; @@ -1331,7 +1363,13 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. - Base = N.getOperand(0); + if (FrameIndexSDNode *FI = + dyn_cast(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } Disp = DAG.getTargetConstant(imm, N.getValueType()); return true; } @@ -1491,10 +1529,9 @@ static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, HiOpFlags = PPCII::MO_HA; LoOpFlags = PPCII::MO_LO; - // Don't use the pic base if not in PIC relocation model. Or if we are on a - // non-darwin platform. We don't support PIC on other platforms yet. - bool isPIC = TM.getRelocationModel() == Reloc::PIC_ && - TM.getSubtarget().isDarwin(); + // Don't use the pic base if not in PIC relocation model. 
+ bool isPIC = TM.getRelocationModel() == Reloc::PIC_; + if (isPIC) { HiOpFlags |= PPCII::MO_PIC_FLAG; LoOpFlags |= PPCII::MO_PIC_FLAG; @@ -1550,6 +1587,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), + PPCII::MO_PIC_FLAG); + SDLoc DL(CP); + return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + SDValue CPIHi = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); SDValue CPILo = @@ -1571,6 +1617,15 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + PPCII::MO_PIC_FLAG); + SDLoc DL(GA); + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); return LowerLabelRef(JTIHi, JTILo, isPIC, DAG); @@ -1633,47 +1688,61 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, if (Model == TLSModel::GeneralDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, - GOTReg, TGA); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT, - GOTEntryHi, TGA); + GOTPtr, TGA); // We need a chain node, and don't have one handy. The underlying // call has no side effects, so using the function entry node // suffices. SDValue Chain = DAG.getEntryNode(); - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); - SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? MVT::i64 : MVT::i32); SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl, PtrVT, ParmReg, TGA); // The return value from GET_TLS_ADDR really is in X3 already, but // some hacks are needed here to tie everything together. The extra // copies dissolve during subsequent transforms. - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); - return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT); + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr); + return DAG.getCopyFromReg(Chain, dl, is64bit ? 
PPC::X3 : PPC::R3, PtrVT); } if (Model == TLSModel::LocalDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, - GOTReg, TGA); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT, - GOTEntryHi, TGA); + GOTPtr, TGA); // We need a chain node, and don't have one handy. The underlying // call has no side effects, so using the function entry node // suffices. SDValue Chain = DAG.getEntryNode(); - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); - SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? MVT::i64 : MVT::i32); SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl, PtrVT, ParmReg, TGA); // The return value from GET_TLSLD_ADDR really is in X3 already, but // some hacks are needed here to tie everything together. The extra // copies dissolve during subsequent transforms. - Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr); SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, Chain, ParmReg, TGA); return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); @@ -1700,6 +1769,14 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV); + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, + GSDN->getOffset(), + PPCII::MO_PIC_FLAG); + return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32)); + } + SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); SDValue GALo = @@ -1794,7 +1871,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, // gpr_index SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, VAListPtr, MachinePointerInfo(SV), MVT::i8, - false, false, 0); + false, false, false, 0); InChain = GprIndex.getValue(1); if (VT == MVT::i64) { @@ -1817,7 +1894,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, // fpr SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, FprPtr, MachinePointerInfo(SV), MVT::i8, - false, false, 0); + false, false, false, 0); InChain = FprIndex.getValue(1); SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, @@ -2127,14 +2204,19 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, unsigned ArgSize = ArgVT.getStoreSize(); if (Flags.isByVal()) ArgSize = Flags.getByValSize(); - ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + + // Round up to multiples of the pointer size, except for array members, + // which are always packed. + if (!Flags.isInConsecutiveRegs()) + ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; return ArgSize; } /// CalculateStackSlotAlignment - Calculates the alignment of this argument /// on the stack. 
-static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags, +static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned Align = PtrByteSize; @@ -2156,14 +2238,78 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags, } } + // Array members are always packed to their original alignment. + if (Flags.isInConsecutiveRegs()) { + // If the array member was split into multiple registers, the first + // needs to be aligned to the size of the full type. (Except for + // ppcf128, which is only aligned as its f64 components.) + if (Flags.isSplit() && OrigVT != MVT::ppcf128) + Align = OrigVT.getStoreSize(); + else + Align = ArgVT.getStoreSize(); + } + return Align; } +/// CalculateStackSlotUsed - Return whether this argument will use its +/// stack slot (instead of being passed in registers). ArgOffset, +/// AvailableFPRs, and AvailableVRs must hold the current argument +/// position, and will be updated to account for this argument. +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, + unsigned LinkageSize, + unsigned ParamAreaSize, + unsigned &ArgOffset, + unsigned &AvailableFPRs, + unsigned &AvailableVRs) { + bool UseMemory = false; + + // Respect alignment of argument on the stack. + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + // If there's no space left in the argument save area, we must + // use memory (this check also catches zero-sized arguments). + if (ArgOffset >= LinkageSize + ParamAreaSize) + UseMemory = true; + + // Allocate argument on the stack. + ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // If we overran the argument save area, we must use memory + // (this check catches arguments passed partially in memory) + if (ArgOffset > LinkageSize + ParamAreaSize) + UseMemory = true; + + // However, if the argument is actually passed in an FPR or a VR, + // we don't use memory after all. + if (!Flags.isByVal()) { + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) + if (AvailableFPRs > 0) { + --AvailableFPRs; + return false; + } + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + if (AvailableVRs > 0) { + --AvailableVRs; + return false; + } + } + + return UseMemory; +} + /// EnsureStackAlignment - Round stack frame size up from NumBytes to /// ensure minimum alignment required for target. static unsigned EnsureStackAlignment(const TargetMachine &Target, unsigned NumBytes) { - unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment(); + unsigned TargetAlign = + Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment(); unsigned AlignMask = TargetAlign - 1; NumBytes = (NumBytes + AlignMask) & ~AlignMask; return NumBytes; @@ -2240,11 +2386,11 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Assign locations to all of the incoming arguments. SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Reserve space for the linkage area on the stack. 
- unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false); CCInfo.AllocateStack(LinkageSize, PtrByteSize); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -2315,7 +2461,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // caller's stack frame, right above the parameter list area. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -2437,6 +2583,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // + bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2448,8 +2595,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false); - unsigned ArgOffset = LinkageSize; + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -2471,12 +2618,29 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof(VR); - unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; + // Do a first pass over the arguments to determine whether the ABI + // guarantees that our caller has allocated the parameter save area + // on its stack frame. In the ELFv1 ABI, this is always the case; + // in the ELFv2 ABI, it is true if this is a vararg function or if + // any parameter is located in a stack slot. + + bool HasParameterArea = !isELFv2ABI || isVarArg; + unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; + unsigned NumBytes = LinkageSize; + unsigned AvailableFPRs = Num_FPR_Regs; + unsigned AvailableVRs = Num_VR_Regs; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs)) + HasParameterArea = true; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; @@ -2484,6 +2648,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; + EVT OrigVT = Ins[ArgNo].ArgVT; unsigned ObjSize = ObjectVT.getStoreSize(); unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; @@ -2492,7 +2657,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( /* Respect alignment of argument on the stack. 
*/ unsigned Align = - CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize); + CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; unsigned CurArgOffset = ArgOffset; @@ -2520,15 +2685,31 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } - // All aggregates smaller than 8 bytes must be passed right-justified. - if (ObjSize < PtrByteSize && !isLittleEndian) - CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize); - // The value of the object is its address. - int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); + // Create a stack object covering all stack doublewords occupied + // by the argument. If the argument is (fully or partially) on + // the stack, or if the argument is fully in registers but the + // caller has allocated the parameter save anyway, we can refer + // directly to the caller's stack frame. Otherwise, create a + // local copy in our own frame. + int FI; + if (HasParameterArea || + ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) + FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false); + else + FI = MFI->CreateStackObject(ArgSize, Align, false); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(FIN); - if (ObjSize < 8) { + // Handle aggregates smaller than 8 bytes. + if (ObjSize < PtrByteSize) { + // The value of the object is its address, which differs from the + // address of the enclosing doubleword on big-endian systems. + SDValue Arg = FIN; + if (!isLittleEndian) { + SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT); + Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); + } + InVals.push_back(Arg); + if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); @@ -2537,18 +2718,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( if (ObjSize==1 || ObjSize==2 || ObjSize==4) { EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); - Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, MachinePointerInfo(FuncArg), ObjType, false, false, 0); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area - // slot. The address of the parameter was already calculated - // above (InVals.push_back(FIN)) to be the right-justified - // offset within the slot. For this store, we need a new - // frame index that points at the beginning of the slot. - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + // slot. Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(FuncArg), false, false, 0); @@ -2562,27 +2738,29 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } + // The value of the object is its address, which is the address of + // its first stack doubleword. + InVals.push_back(FIN); + + // Store whatever pieces of the object are in registers to memory. for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { - // Store whatever pieces of the object are in registers - // to memory. ArgOffset will be the address of the beginning - // of the object. 
- if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, j), - false, false, 0); - MemOps.push_back(Store); - ++GPR_idx; - ArgOffset += PtrByteSize; - } else { - ArgOffset += ArgSize - j; + if (GPR_idx == Num_GPR_Regs) break; + + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Addr = FIN; + if (j) { + SDValue Off = DAG.getConstant(j, PtrVT); + Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(FuncArg, j), + false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; } + ArgOffset += ArgSize; continue; } @@ -2591,6 +2769,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); @@ -2608,6 +2789,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::f32: case MVT::f64: + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. if (FPR_idx != Num_FPR_Regs) { unsigned VReg; @@ -2620,12 +2804,32 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; + } else if (GPR_idx != Num_GPR_Regs) { + // This can only ever happen in the presence of f32 array types, + // since otherwise we never run out of FPRs before running out + // of GPRs. + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::f32) { + if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) + ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); + } + + ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { needsLoad = true; - ArgSize = PtrByteSize; } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; + ArgOffset += ArgSize; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; case MVT::v4f32: case MVT::v4i32: @@ -2633,6 +2837,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. if (VR_idx != Num_VR_Regs) { unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? 
MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : @@ -2662,7 +2869,10 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // Area that is at least reserved in the caller of this function. unsigned MinReservedArea; - MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + if (HasParameterArea) + MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + else + MinReservedArea = LinkageSize; // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that @@ -2723,7 +2933,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 8 : 4; - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; @@ -3336,6 +3547,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); NodeTys.push_back(MVT::Other); // Returns a chain @@ -3357,15 +3569,18 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // far-call stubs may be outside relocation limits for a BL instruction. if (!DAG.getTarget().getSubtarget().isJITCodeModel()) { unsigned OpFlags = 0; - if (DAG.getTarget().getRelocationModel() != Reloc::Static && + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && (Subtarget.getTargetTriple().isMacOSX() && Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && (G->getGlobal()->isDeclaration() || - G->getGlobal()->isWeakForLinker())) { + G->getGlobal()->isWeakForLinker())) || + (Subtarget.isTargetELF() && !isPPC64 && + !G->getGlobal()->hasLocalLinkage() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. - OpFlags = PPCII::MO_DARWIN_STUB; + OpFlags = PPCII::MO_PLT_OR_STUB; } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, @@ -3381,13 +3596,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; - if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (Subtarget.getTargetTriple().isMacOSX() && - Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) || + (Subtarget.isTargetELF() && !isPPC64 && + DAG.getTarget().getRelocationModel() == Reloc::PIC_) ) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. - OpFlags = PPCII::MO_DARWIN_STUB; + OpFlags = PPCII::MO_PLT_OR_STUB; } Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), @@ -3400,7 +3617,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // to do the call, we can't use PPCISD::CALL. 
SDValue MTCTROps[] = {Chain, Callee, InFlag}; - if (isSVR4ABI && isPPC64) { + if (isSVR4ABI && isPPC64 && !isELFv2ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). @@ -3480,7 +3697,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64) + if (isSVR4ABI && isPPC64 && !isELFv2ABI) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -3502,6 +3719,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); + // Direct calls in the ELFv2 ABI need the TOC register live into the call. + if (Callee.getNode() && isELFv2ABI) + Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + return CallOpc; } @@ -3522,8 +3743,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVectorImpl &InVals) const { SmallVector RVLocs; - CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); // Copy all of the result registers out of their specified physreg. @@ -3571,6 +3792,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals) const { + + bool isELFv2ABI = Subtarget.isELFv2ABI(); std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, @@ -3589,7 +3812,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3646,7 +3870,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); - unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); @@ -3735,11 +3959,12 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing arguments. SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Reserve space for the linkage area on the stack. 
- CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); + CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false), + PtrByteSize); if (isVarArg) { // Handle fixed and variable vector arguments differently. @@ -3776,7 +4001,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing aggregate by value arguments. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -3948,6 +4173,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { + bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); @@ -3966,21 +4192,27 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with at least 48 bytes, which - // is reserved space for [SP][CR][LR][3 x unused]. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false); + // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes + // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage + // area is 32 bytes reserved space for [SP][CR][LR][TOC]. + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); unsigned NumBytes = LinkageSize; // Add up all the space actually used. for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; + EVT OrigVT = Outs[i].ArgVT; /* Respect alignment of argument on the stack. */ - unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize); + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); NumBytes = ((NumBytes + Align - 1) / Align) * Align; NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } unsigned NumBytesActuallyUsed = NumBytes; @@ -3990,6 +4222,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. + // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); // Tail call needs the stack to be aligned. @@ -4056,10 +4289,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; + EVT ArgVT = Outs[i].VT; + EVT OrigVT = Outs[i].ArgVT; /* Respect alignment of argument on the stack. */ unsigned Align = - CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize); + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; /* Compute GPR index associated with argument offset. 
*/ @@ -4103,7 +4338,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT, - false, false, 0); + false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); @@ -4199,6 +4434,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::i1: case MVT::i32: case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != NumGPRs) { RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg)); } else { @@ -4209,39 +4447,70 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, ArgOffset += PtrByteSize; break; case MVT::f32: - case MVT::f64: - if (FPR_idx != NumFPRs) { + case MVT::f64: { + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. + + // Named arguments go into FPRs first, and once they overflow, the + // remaining arguments go into GPRs and then the parameter save area. + // Unnamed arguments for vararg functions always go to GPRs and + // then the parameter save area. For now, put all arguments to vararg + // routines always in both locations (FPR *and* GPR or stack slot). + bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; + + // First load the argument into the next available FPR. + if (FPR_idx != NumFPRs) RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); - if (isVarArg) { - // A single float or an aggregate containing only a single float - // must be passed right-justified in the stack doubleword, and - // in the GPR, if one is available. - SDValue StoreOff; - if (Arg.getSimpleValueType().SimpleTy == MVT::f32 && - !isLittleEndian) { - SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); - StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); - } else - StoreOff = PtrOff; - - SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff, - MachinePointerInfo(), false, false, 0); - MemOpChains.push_back(Store); - - // Float varargs are always shadowed in available integer registers - if (GPR_idx != NumGPRs) { - SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, - MachinePointerInfo(), false, false, - false, 0); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); - } - } + // Next, load the argument into GPR or stack slot if needed. + if (!NeedGPROrStack) + ; + else if (GPR_idx != NumGPRs) { + // In the non-vararg case, this can only ever happen in the + // presence of f32 array types, since otherwise we never run + // out of FPRs before running out of GPRs. + SDValue ArgVal; + + // Double values are always passed in a single GPR. + if (Arg.getValueType() != MVT::f32) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); + + // Non-array float values are extended and passed in a GPR. + } else if (!Flags.isInConsecutiveRegs()) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); + + // If we have an array of floats, we collect every odd element + // together with its predecessor into one GPR. 
+ } else if (ArgOffset % PtrByteSize != 0) { + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); + Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + if (!isLittleEndian) + std::swap(Lo, Hi); + ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); + + // The final element, if even, goes into the first half of a GPR. + } else if (Flags.isInConsecutiveRegsLast()) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); + if (!isLittleEndian) + ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + + // Non-final even elements are skipped; they will be handled + // together the with subsequent argument on the next go-around. + } else + ArgVal = SDValue(); + + if (ArgVal.getNode()) + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal)); } else { // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. - if (Arg.getValueType() == MVT::f32 && !isLittleEndian) { + if (Arg.getValueType() == MVT::f32 && + !isLittleEndian && !Flags.isInConsecutiveRegs()) { SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } @@ -4250,14 +4519,25 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, true, isTailCall, false, MemOpChains, TailCallArguments, dl); } - ArgOffset += 8; + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgOffset += (Arg.getValueType() == MVT::f32 && + Flags.isInConsecutiveRegs()) ? 4 : 8; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; break; + } case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + // For a varargs call, named arguments go into VRs or on the stack as // usual; unnamed arguments always go to the stack or the corresponding // GPRs when within range. For now, we always put the value in both @@ -4328,11 +4608,16 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // Load r2 into a virtual register and store it to the TOC save area. SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. - unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(), false, false, 0); + // In the ELFv2 ABI, R12 must contain the address of an indirect callee. + // This does not mean the MTCTR instruction must use R12; it's easier + // to model this as an extra parameter, so do that. + if (isELFv2ABI) + RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain @@ -4383,7 +4668,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. 
We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. - unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); unsigned NumBytes = LinkageSize; // Add up all the space actually used. @@ -4522,7 +4808,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT, - false, false, 0); + false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); @@ -4751,8 +5037,7 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), - RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_PPC); } @@ -4764,8 +5049,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain, SDLoc dl, SelectionDAG &DAG) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); SDValue Flag; @@ -5773,15 +6058,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || - PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) || - PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) || - PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) { + PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG)) { return Op; } } @@ -5789,15 +6074,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. - if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) || - PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) || - PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) || - PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) || - PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG)) + unsigned int ShuffleKind = isLittleEndian ? 
2 : 0; + if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG)) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our @@ -6256,7 +6542,8 @@ MachineBasicBlock * PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, bool is64bit, unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); @@ -6318,7 +6605,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, bool is8bit, // operation unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're @@ -6446,7 +6734,8 @@ llvm::MachineBasicBlock* PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6545,7 +6834,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); const PPCRegisterInfo *TRI = - static_cast(getTargetMachine().getRegisterInfo()); + getTargetMachine().getSubtarget().getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); @@ -6594,7 +6883,8 @@ MachineBasicBlock * PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -6613,7 +6903,10 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; - unsigned BP = (PVT == MVT::i64) ? PPC::X30 : PPC::R30; + unsigned BP = (PVT == MVT::i64) ? PPC::X30 : + (Subtarget.isSVR4ABI() && + MF->getTarget().getRelocationModel() == Reloc::PIC_ ? 
+ PPC::R29 : PPC::R30); MachineInstrBuilder MIB; @@ -6703,7 +6996,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return emitEHSjLjLongJmp(MI, BB); } - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); // To "insert" these instructions we actually have to insert their // control-flow patterns. @@ -6726,7 +7020,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, Cond.push_back(MI->getOperand(1)); DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), Cond, MI->getOperand(2).getReg(), MI->getOperand(3).getReg()); @@ -6833,13 +7128,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8); + BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); @@ -7266,16 +7561,12 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, return SDValue(); } -// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does -// not enforce equality of the chain operands. -static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, +static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { - EVT VT = LS->getMemoryVT(); if (VT.getSizeInBits() / 8 != Bytes) return false; - SDValue Loc = LS->getBasePtr(); SDValue BaseLoc = Base->getBasePtr(); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) @@ -7306,11 +7597,69 @@ static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, return false; } +// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does +// not enforce equality of the chain operands. 
+static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
+    EVT VT = LS->getMemoryVT();
+    SDValue Loc = LS->getBasePtr();
+    return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
+  }
+
+  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_altivec_lvx:
+    case Intrinsic::ppc_altivec_lvxl:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    }
+
+    return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
+  }
+
+  if (N->getOpcode() == ISD::INTRINSIC_VOID) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_altivec_stvx:
+    case Intrinsic::ppc_altivec_stvxl:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    }
+
+    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
+  }
+
+  return false;
+}
+
 // Return true if there is a nearby consecutive load to the one provided
 // (regardless of alignment). We search up and down the chain, looking through
-// token factors and other loads (but nothing else). As a result, a true
-// results indicates that it is safe to create a new consecutive load adjacent
-// to the load provided.
+// token factors and other loads (but nothing else). As a result, a true result
+// indicates that it is safe to create a new consecutive load adjacent to the
+// load provided.
 static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
   SDValue Chain = LD->getChain();
   EVT VT = LD->getMemoryVT();
@@ -7327,7 +7676,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
     if (!Visited.insert(ChainNext))
       continue;
-    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
       if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
         return true;
@@ -7358,14 +7707,14 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
     if (!Visited.insert(LoadRoot))
       continue;
-    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
       if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
        return true;
     for (SDNode::use_iterator UI = LoadRoot->use_begin(),
          UE = LoadRoot->use_end(); UI != UE; ++UI)
-      if (((isa<LoadSDNode>(*UI) &&
-            cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+      if (((isa<MemSDNode>(*UI) &&
+            cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
           UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
         Queue.push_back(*UI);
   }
@@ -8149,17 +8498,25 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                 Intrinsic::ppc_altivec_lvsl);
     SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
-    // Refine the alignment of the original load (a "new" load created here
-    // which was identical to the first except for the alignment would be
-    // merged with the existing node regardless).
+    // Create the new MMO for the new base load. It is like the original MMO,
+    // but represents an area in memory almost twice the vector size centered
+    // on the original address. If the address is unaligned, we might start
+    // reading up to (sizeof(vector)-1) bytes below the address of the
+    // original unaligned load.
     MachineFunction &MF = DAG.getMachineFunction();
-    MachineMemOperand *MMO =
-      MF.getMachineMemOperand(LD->getPointerInfo(),
-                              LD->getMemOperand()->getFlags(),
-                              LD->getMemoryVT().getStoreSize(),
-                              ABIAlignment);
-    LD->refineAlignment(MMO);
-    SDValue BaseLoad = SDValue(LD, 0);
+    MachineMemOperand *BaseMMO =
+      MF.getMachineMemOperand(LD->getMemOperand(),
+                              -LD->getMemoryVT().getStoreSize()+1,
+                              2*LD->getMemoryVT().getStoreSize()-1);
+
+    // Create the new base load.
+    SDValue LDXIntID = DAG.getTargetConstant(Intrinsic::ppc_altivec_lvx,
+                                             getPointerTy());
+    SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
+    SDValue BaseLoad =
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                              DAG.getVTList(MVT::v4i32, MVT::Other),
+                              BaseLoadOps, MVT::v4i32, BaseMMO);
     // Note that the value of IncOffset (which is provided to the next
     // load's pointer info offset value, and thus used to calculate the
@@ -8181,21 +8538,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    MachineMemOperand *ExtraMMO =
+      MF.getMachineMemOperand(LD->getMemOperand(),
+                              1, 2*LD->getMemoryVT().getStoreSize()-1);
+    SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
     SDValue ExtraLoad =
-      DAG.getLoad(VT, dl, Chain, Ptr,
-                  LD->getPointerInfo().getWithOffset(IncOffset),
-                  LD->isVolatile(), LD->isNonTemporal(),
-                  LD->isInvariant(), ABIAlignment);
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                              DAG.getVTList(MVT::v4i32, MVT::Other),
+                              ExtraLoadOps, MVT::v4i32, ExtraMMO);
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                              BaseLoad.getValue(1), ExtraLoad.getValue(1));
-    if (BaseLoad.getValueType() != MVT::v4i32)
-      BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
-
-    if (ExtraLoad.getValueType() != MVT::v4i32)
-      ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
-
     // Because vperm has a big-endian bias, we must reverse the order
     // of the input vectors and complement the permute control vector
     // when generating little endian code. We have already handled the
@@ -8212,36 +8566,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     if (VT != MVT::v4i32)
       Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
-    // Now we need to be really careful about how we update the users of the
-    // original load. We cannot just call DCI.CombineTo (or
-    // DAG.ReplaceAllUsesWith for that matter), because the load still has
-    // uses created here (the permutation for example) that need to stay.
-    SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
-    while (UI != UE) {
-      SDUse &Use = UI.getUse();
-      SDNode *User = *UI;
-      // Note: BaseLoad is checked here because it might not be N, but a
-      // bitcast of N.
-      if (User == Perm.getNode() || User == BaseLoad.getNode() ||
-          User == TF.getNode() || Use.getResNo() > 1) {
-        ++UI;
-        continue;
-      }
-
-      SDValue To = Use.getResNo() ? TF : Perm;
-      ++UI;
-
-      SmallVector Ops;
-      for (const SDUse &O : User->ops()) {
-        if (O == Use)
-          Ops.push_back(To);
-        else
-          Ops.push_back(O);
-      }
-
-      DAG.UpdateNodeOperands(User, Ops);
-    }
-
+    // The output of the permutation is our loaded result, the TokenFactor is
+    // our new chain.
+    DCI.CombineTo(N, Perm, TF);
     return SDValue(N, 0);
     }
     }
@@ -8659,7 +8986,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   // the AsmName field from *RegisterInfo.td, then this would not be necessary.
   if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
       PPC::GPRCRegClass.contains(R.first)) {
-    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const TargetRegisterInfo *TRI =
+        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
     return std::make_pair(TRI->getMatchingSuperReg(R.first, PPC::sub_32,
                           &PPC::G8RCRegClass),
                           &PPC::G8RCRegClass);
@@ -8931,9 +9259,10 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
   return isInt<16>(Imm) || isUInt<16>(Imm);
 }
-bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                      unsigned,
-                                                      bool *Fast) const {
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
   if (DisablePPCUnaligned)
     return false;
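As context for the last hunk above: allowsUnalignedMemoryAccesses is renamed to allowsMisalignedMemoryAccesses and gains an extra unsigned parameter, so the PPC override now matches the hook's (type, address space, alignment, fast-flag) shape. A minimal sketch of how a caller might query the renamed hook, assuming a TargetLowering reference TLI, a value type VT, and an address space AS are already in scope (illustrative only, not part of this patch):

  // Ask the target whether a misaligned access of type VT in address space AS
  // is legal, and whether it is also fast on this subtarget.
  bool Fast = false;
  if (TLI.allowsMisalignedMemoryAccesses(VT, AS, /*Align=*/1, &Fast) && Fast) {
    // The target reports the misaligned access as both legal and cheap, so a
    // caller could emit it directly instead of splitting or realigning it.
  }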