#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCTargetMachine.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+// FIXME: Remove this once soft-float is supported.
+static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic",
+cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden);
+
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
- // If it isn't a Mach-O file then it's going to be a linux ELF
- // object file.
- if (TT.isOSDarwin())
- return new TargetLoweringObjectFileMachO();
-
- return new PPC64LinuxTargetObjectFile();
-}
-
-PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
- : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
+ : TargetLowering(TM),
Subtarget(*TM.getSubtargetImpl()) {
- setPow2DivIsCheap();
-
// Use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(true);
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+ }
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
if (ANDIGlueBug)
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
- setTruncStoreAction(MVT::i64, MVT::i1, Expand);
- setTruncStoreAction(MVT::i32, MVT::i1, Expand);
- setTruncStoreAction(MVT::i16, MVT::i1, Expand);
- setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setTruncStoreAction(VT, MVT::i1, Expand);
+ }
addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
}
if (Subtarget.hasAltivec()) {
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
- for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
-
+ for (MVT VT : MVT::vector_valuetypes()) {
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD , VT, Legal);
setOperationAction(ISD::SUB , VT, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
- for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
- MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
- setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, Expand);
}
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
// Altivec does not contain unordered floating-point compare instructions
setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
- setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
- setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);
-
setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
// Share the Altivec comparison restrictions.
setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::v2f64, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::v2f64, Expand);
- setCondCodeAction(ISD::SETULT, MVT::v2f64, Expand);
- setCondCodeAction(ISD::SETULE, MVT::v2f64, Expand);
-
setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
}
}
- if (Subtarget.has64BitSupport()) {
+ if (Subtarget.has64BitSupport())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
- setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
- }
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
+
+ if (!isPPC64) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+ }
setBooleanContents(ZeroOrOneBooleanContent);
// Altivec instructions set fields to all zeros or all ones.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+ if (!isPPC64) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
if (isPPC64) {
setStackPointerRegisterToSaveRestore(PPC::X1);
setExceptionPointerRegister(PPC::X3);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::SINT_TO_FP);
+ if (Subtarget.hasFPCVT())
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::BR_CC);
setTargetDAGCombine(ISD::BRCOND);
setTargetDAGCombine(ISD::BSWAP);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
if (Subtarget.isDarwin())
setPrefFunctionAlignment(4);
- if (isPPC64 && Subtarget.isJITCodeModel())
- // Temporary workaround for the inability of PPC64 JIT to handle jump
- // tables.
- setSupportJumpTables(false);
+ switch (Subtarget.getDarwinDirective()) {
+ default: break;
+ case PPC::DIR_970:
+ case PPC::DIR_A2:
+ case PPC::DIR_E500mc:
+ case PPC::DIR_E5500:
+ case PPC::DIR_PWR4:
+ case PPC::DIR_PWR5:
+ case PPC::DIR_PWR5X:
+ case PPC::DIR_PWR6:
+ case PPC::DIR_PWR6X:
+ case PPC::DIR_PWR7:
+ case PPC::DIR_PWR8:
+ setPrefFunctionAlignment(4);
+ setPrefLoopAlignment(4);
+ break;
+ }
setInsertFencesForAtomic(true);
computeRegisterProperties();
- // The Freescale cores does better with aggressive inlining of memcpy and
- // friends. Gcc uses same threshold of 128 bytes (= 32 word stores).
+ // The Freescale cores do better with aggressive inlining of memcpy and
+ // friends. GCC uses same threshold of 128 bytes (= 32 word stores).
if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
MaxStoresPerMemset = 32;
MaxStoresPerMemcpyOptSize = 8;
MaxStoresPerMemmove = 32;
MaxStoresPerMemmoveOptSize = 8;
-
- setPrefFunctionAlignment(4);
}
}
default: return nullptr;
case PPCISD::FSEL: return "PPCISD::FSEL";
case PPCISD::FCFID: return "PPCISD::FCFID";
+ case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
+ case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
+ case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
+ case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
+ case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
case PPCISD::VPERM: return "PPCISD::VPERM";
+ case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
case PPCISD::SHL: return "PPCISD::SHL";
case PPCISD::CALL: return "PPCISD::CALL";
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
+ case PPCISD::CALL_TLS: return "PPCISD::CALL_TLS";
+ case PPCISD::CALL_NOP_TLS: return "PPCISD::CALL_NOP_TLS";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL: return "PPCISD::BCTRL";
+ case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
+ case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
case PPCISD::VCMPo: return "PPCISD::VCMPo";
case PPCISD::LBRX: return "PPCISD::LBRX";
case PPCISD::STBRX: return "PPCISD::STBRX";
+ case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
+ case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
case PPCISD::LARX: return "PPCISD::LARX";
case PPCISD::STCX: return "PPCISD::STCX";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
- case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
- case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
return VT.changeVectorElementTypeToInteger();
}
+bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+ assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// Node matching predicates, for use by the tblgen matching code.
//===----------------------------------------------------------------------===//
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
-bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operantion with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
- unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
- if (!isUnary) {
+ bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
+ if (ShuffleKind == 0) {
+ if (IsLE)
+ return false;
for (unsigned i = 0; i != 16; ++i)
- if (!isConstantOrUndef(N->getMaskElt(i), i*2+j))
+ if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
return false;
- } else {
+ } else if (ShuffleKind == 2) {
+ if (!IsLE)
+ return false;
+ for (unsigned i = 0; i != 16; ++i)
+ if (!isConstantOrUndef(N->getMaskElt(i), i*2))
+ return false;
+ } else if (ShuffleKind == 1) {
+ unsigned j = IsLE ? 0 : 1;
for (unsigned i = 0; i != 8; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
-bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operantion with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
- unsigned j, k;
- if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
- j = 0;
- k = 1;
- } else {
- j = 2;
- k = 3;
- }
- if (!isUnary) {
+ bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
+ if (ShuffleKind == 0) {
+ if (IsLE)
+ return false;
for (unsigned i = 0; i != 16; i += 2)
- if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
- !isConstantOrUndef(N->getMaskElt(i+1), i*2+k))
+ if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
+ !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
return false;
- } else {
+ } else if (ShuffleKind == 2) {
+ if (!IsLE)
+ return false;
+ for (unsigned i = 0; i != 16; i += 2)
+ if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
+ !isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
+ return false;
+ } else if (ShuffleKind == 1) {
+ unsigned j = IsLE ? 0 : 2;
for (unsigned i = 0; i != 8; i += 2)
- if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
- !isConstantOrUndef(N->getMaskElt(i+1), i*2+k) ||
- !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
- !isConstantOrUndef(N->getMaskElt(i+9), i*2+k))
+ if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
+ !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
+ !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
+ !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
return false;
}
return true;
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2). For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
- bool isUnary, SelectionDAG &DAG) {
- if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
- if (!isUnary)
+ unsigned ShuffleKind, SelectionDAG &DAG) {
+ if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
+ if (ShuffleKind == 1) // unary
+ return isVMerge(N, UnitSize, 0, 0);
+ else if (ShuffleKind == 2) // swapped
return isVMerge(N, UnitSize, 0, 16);
- return isVMerge(N, UnitSize, 0, 0);
+ else
+ return false;
} else {
- if (!isUnary)
+ if (ShuffleKind == 1) // unary
+ return isVMerge(N, UnitSize, 8, 8);
+ else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 8, 24);
- return isVMerge(N, UnitSize, 8, 8);
+ else
+ return false;
}
}
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2). For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
- bool isUnary, SelectionDAG &DAG) {
- if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
- if (!isUnary)
+ unsigned ShuffleKind, SelectionDAG &DAG) {
+ if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
+ if (ShuffleKind == 1) // unary
+ return isVMerge(N, UnitSize, 8, 8);
+ else if (ShuffleKind == 2) // swapped
return isVMerge(N, UnitSize, 8, 24);
- return isVMerge(N, UnitSize, 8, 8);
+ else
+ return false;
} else {
- if (!isUnary)
+ if (ShuffleKind == 1) // unary
+ return isVMerge(N, UnitSize, 0, 0);
+ else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 0, 16);
- return isVMerge(N, UnitSize, 0, 0);
+ else
+ return false;
}
}
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
-int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) {
+/// The ShuffleKind distinguishes between big-endian operations with two
+/// different inputs (0), either-endian operations with two identical inputs
+/// (1), and little-endian operations with two different inputs (2). For the
+/// latter, the input operands are swapped (see PPCInstrAltivec.td).
+int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG) {
if (N->getValueType(0) != MVT::v16i8)
return -1;
unsigned ShiftAmt = SVOp->getMaskElt(i);
if (ShiftAmt < i) return -1;
- if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
-
- ShiftAmt += i;
-
- if (!isUnary) {
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != 16; ++i)
- if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i))
- return -1;
- } else {
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != 16; ++i)
- if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15))
- return -1;
- }
-
- } else { // Big Endian
+ ShiftAmt -= i;
+ bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()->
+ isLittleEndian();
+
+ if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+ return -1;
+ } else if (ShuffleKind == 1) {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+ return -1;
+ } else
+ return -1;
- ShiftAmt -= i;
+ if (ShuffleKind == 2 && isLE)
+ ShiftAmt = 16 - ShiftAmt;
- if (!isUnary) {
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != 16; ++i)
- if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
- return -1;
- } else {
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != 16; ++i)
- if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
- return -1;
- }
- }
return ShiftAmt;
}
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
assert(isSplatShuffleMask(SVOp, EltSize));
- if (DAG.getTarget().getDataLayout()->isLittleEndian())
+ if (DAG.getSubtarget().getDataLayout()->isLittleEndian())
return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
else
return SVOp->getMaskElt(0) / EltSize;
if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
- Base = N.getOperand(0);
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+ } else {
+ Base = N.getOperand(0);
+ }
Disp = DAG.getTargetConstant(imm, N.getValueType());
return true;
}
HiOpFlags = PPCII::MO_HA;
LoOpFlags = PPCII::MO_LO;
- // Don't use the pic base if not in PIC relocation model. Or if we are on a
- // non-darwin platform. We don't support PIC on other platforms yet.
- bool isPIC = TM.getRelocationModel() == Reloc::PIC_ &&
- TM.getSubtarget<PPCSubtarget>().isDarwin();
+ // Don't use the pic base if not in PIC relocation model.
+ bool isPIC = TM.getRelocationModel() == Reloc::PIC_;
+
if (isPIC) {
HiOpFlags |= PPCII::MO_PIC_FLAG;
LoOpFlags |= PPCII::MO_PIC_FLAG;
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+
+ if (isPIC && Subtarget.isSVR4ABI()) {
+ SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
+ PPCII::MO_PIC_FLAG);
+ SDLoc DL(CP);
+ return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
+ DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+ }
+
SDValue CPIHi =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
SDValue CPILo =
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+
+ if (isPIC && Subtarget.isSVR4ABI()) {
+ SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+ PPCII::MO_PIC_FLAG);
+ SDLoc DL(GA);
+ return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA,
+ DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+ }
+
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
return LowerLabelRef(JTIHi, JTILo, isPIC, DAG);
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
+ BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+ const BlockAddress *BA = BASDN->getBlockAddress();
- const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual BlockAddress is stored in the TOC.
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
+ return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(BASDN), MVT::i64, GA,
+ DAG.getRegister(PPC::X2, MVT::i64));
+ }
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
}
+// Generate a call to __tls_get_addr for the given GOT entry Op.
+std::pair<SDValue,SDValue>
+PPCTargetLowering::lowerTLSCall(SDValue Op, SDLoc dl,
+ SelectionDAG &DAG) const {
+
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Op;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(CallingConv::C, IntPtrTy,
+ DAG.getTargetExternalSymbol("__tls_get_addr", getPointerTy()),
+ std::move(Args), 0);
+
+ return LowerCallTo(CLI);
+}
+
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy();
bool is64bit = Subtarget.isPPC64();
+ const Module *M = DAG.getMachineFunction().getFunction()->getParent();
+ PICLevel::Level picLevel = M->getPICLevel();
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
}
if (Model == TLSModel::GeneralDynamic) {
- SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
- SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
- SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
- GOTReg, TGA);
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TLSGD);
+ SDValue GOTPtr;
+ if (is64bit) {
+ SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+ GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
+ GOTReg, TGA);
+ } else {
+ if (picLevel == PICLevel::Small)
+ GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+ else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ }
SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT,
- GOTEntryHi, TGA);
-
- // We need a chain node, and don't have one handy. The underlying
- // call has no side effects, so using the function entry node
- // suffices.
- SDValue Chain = DAG.getEntryNode();
- Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
- SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
- SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl,
- PtrVT, ParmReg, TGA);
- // The return value from GET_TLS_ADDR really is in X3 already, but
- // some hacks are needed here to tie everything together. The extra
- // copies dissolve during subsequent transforms.
- Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
- return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT);
+ GOTPtr, TGA);
+ std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
+ return CallResult.first;
}
if (Model == TLSModel::LocalDynamic) {
- SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
- SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
- SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
- GOTReg, TGA);
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TLSLD);
+ SDValue GOTPtr;
+ if (is64bit) {
+ SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+ GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
+ GOTReg, TGA);
+ } else {
+ if (picLevel == PICLevel::Small)
+ GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+ else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ }
SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT,
- GOTEntryHi, TGA);
-
- // We need a chain node, and don't have one handy. The underlying
- // call has no side effects, so using the function entry node
- // suffices.
- SDValue Chain = DAG.getEntryNode();
- Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
- SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
- SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl,
- PtrVT, ParmReg, TGA);
- // The return value from GET_TLSLD_ADDR really is in X3 already, but
- // some hacks are needed here to tie everything together. The extra
- // copies dissolve during subsequent transforms.
- Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
+ GOTPtr, TGA);
+ std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
+ SDValue TLSAddr = CallResult.first;
+ SDValue Chain = CallResult.second;
SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
- Chain, ParmReg, TGA);
+ Chain, TLSAddr, TGA);
return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
}
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV);
+ if (isPIC && Subtarget.isSVR4ABI()) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+ GSDN->getOffset(),
+ PPCII::MO_PIC_FLAG);
+ return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
+ DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32));
+ }
+
SDValue GAHi =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
SDValue GALo =
// gpr_index
SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
VAListPtr, MachinePointerInfo(SV), MVT::i8,
- false, false, 0);
+ false, false, false, 0);
InChain = GprIndex.getValue(1);
if (VT == MVT::i64) {
// fpr
SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
FprPtr, MachinePointerInfo(SV), MVT::i8,
- false, false, 0);
+ false, false, false, 0);
InChain = FprIndex.getValue(1);
SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__trampoline_setup", PtrVT), &Args, 0);
+ DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
unsigned ArgSize = ArgVT.getStoreSize();
if (Flags.isByVal())
ArgSize = Flags.getByValSize();
- ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+ // Round up to multiples of the pointer size, except for array members,
+ // which are always packed.
+ if (!Flags.isInConsecutiveRegs())
+ ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
return ArgSize;
}
+
+/// CalculateStackSlotAlignment - Calculates the alignment of this argument
+/// on the stack.
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+ ISD::ArgFlagsTy Flags,
+ unsigned PtrByteSize) {
+ unsigned Align = PtrByteSize;
+
+ // Altivec parameters are padded to a 16 byte boundary.
+ if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+ ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+ ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+ Align = 16;
+
+ // ByVal parameters are aligned as requested.
+ if (Flags.isByVal()) {
+ unsigned BVAlign = Flags.getByValAlign();
+ if (BVAlign > PtrByteSize) {
+ if (BVAlign % PtrByteSize != 0)
+ llvm_unreachable(
+ "ByVal alignment is not a multiple of the pointer size");
+
+ Align = BVAlign;
+ }
+ }
+
+ // Array members are always packed to their original alignment.
+ if (Flags.isInConsecutiveRegs()) {
+ // If the array member was split into multiple registers, the first
+ // needs to be aligned to the size of the full type. (Except for
+ // ppcf128, which is only aligned as its f64 components.)
+ if (Flags.isSplit() && OrigVT != MVT::ppcf128)
+ Align = OrigVT.getStoreSize();
+ else
+ Align = ArgVT.getStoreSize();
+ }
+
+ return Align;
+}
+
+/// CalculateStackSlotUsed - Return whether this argument will use its
+/// stack slot (instead of being passed in registers). ArgOffset,
+/// AvailableFPRs, and AvailableVRs must hold the current argument
+/// position, and will be updated to account for this argument.
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
+ ISD::ArgFlagsTy Flags,
+ unsigned PtrByteSize,
+ unsigned LinkageSize,
+ unsigned ParamAreaSize,
+ unsigned &ArgOffset,
+ unsigned &AvailableFPRs,
+ unsigned &AvailableVRs) {
+ bool UseMemory = false;
+
+ // Respect alignment of argument on the stack.
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ // If there's no space left in the argument save area, we must
+ // use memory (this check also catches zero-sized arguments).
+ if (ArgOffset >= LinkageSize + ParamAreaSize)
+ UseMemory = true;
+
+ // Allocate argument on the stack.
+ ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ // If we overran the argument save area, we must use memory
+ // (this check catches arguments passed partially in memory)
+ if (ArgOffset > LinkageSize + ParamAreaSize)
+ UseMemory = true;
+
+ // However, if the argument is actually passed in an FPR or a VR,
+ // we don't use memory after all.
+ if (!Flags.isByVal()) {
+ if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
+ if (AvailableFPRs > 0) {
+ --AvailableFPRs;
+ return false;
+ }
+ if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+ ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+ ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+ if (AvailableVRs > 0) {
+ --AvailableVRs;
+ return false;
+ }
+ }
+
+ return UseMemory;
+}
+
/// EnsureStackAlignment - Round stack frame size up from NumBytes to
/// ensure minimum alignment required for target.
static unsigned EnsureStackAlignment(const TargetMachine &Target,
unsigned NumBytes) {
- unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment();
+ unsigned TargetAlign =
+ Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment();
unsigned AlignMask = TargetAlign - 1;
NumBytes = (NumBytes + AlignMask) & ~AlignMask;
return NumBytes;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage area on the stack.
- CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false);
+ CCInfo.AllocateStack(LinkageSize, PtrByteSize);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
// caller's stack frame, right above the parameter list area.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ByValArgLocs, *DAG.getContext());
+ ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
- MinReservedArea =
- std::max(MinReservedArea,
- PPCFrameLowering::getMinCallFrameSize(false, false));
+ MinReservedArea = std::max(MinReservedArea, LinkageSize);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so that
PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
PPC::F8
};
- const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+ unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+ if (DisablePPCFloatInVariadic)
+ NumFPArgRegs = 0;
FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
NumGPArgRegs));
// Make room for NumGPArgRegs and NumFPArgRegs.
int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
- NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
+ NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
FuncInfo->setVarArgsStackOffset(
MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
MachinePointerInfo(), false, false, 0);
MemOps.push_back(Store);
// Increment the address by eight for the next argument to store
- SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
+ SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = 8;
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
- // Area that is at least reserved in caller of this function.
- unsigned MinReservedArea = ArgOffset;
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+ isELFv2ABI);
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
const unsigned Num_FPR_Regs = 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
- unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+ // Do a first pass over the arguments to determine whether the ABI
+ // guarantees that our caller has allocated the parameter save area
+ // on its stack frame. In the ELFv1 ABI, this is always the case;
+ // in the ELFv2 ABI, it is true if this is a vararg function or if
+ // any parameter is located in a stack slot.
+
+ bool HasParameterArea = !isELFv2ABI || isVarArg;
+ unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
+ unsigned NumBytes = LinkageSize;
+ unsigned AvailableFPRs = Num_FPR_Regs;
+ unsigned AvailableVRs = Num_VR_Regs;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i)
+ if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
+ PtrByteSize, LinkageSize, ParamAreaSize,
+ NumBytes, AvailableFPRs, AvailableVRs))
+ HasParameterArea = true;
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
SmallVector<SDValue, 8> MemOps;
Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
unsigned CurArgIdx = 0;
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
+ EVT OrigVT = Ins[ArgNo].ArgVT;
unsigned ObjSize = ObjectVT.getStoreSize();
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
CurArgIdx = Ins[ArgNo].OrigArgIndex;
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
unsigned CurArgOffset = ArgOffset;
- // Altivec parameters are padded to a 16 byte boundary.
- if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
- ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8 ||
- ObjectVT==MVT::v2f64 || ObjectVT==MVT::v2i64)
- MinReservedArea = ((MinReservedArea+15)/16)*16;
-
- // Calculate min reserved area.
- MinReservedArea += CalculateStackSlotSize(ObjectVT, Flags, PtrByteSize);
+ /* Compute GPR index associated with argument offset. */
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
continue;
}
- unsigned BVAlign = Flags.getByValAlign();
- if (BVAlign > 8) {
- ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
- CurArgOffset = ArgOffset;
- }
-
- // All aggregates smaller than 8 bytes must be passed right-justified.
- if (ObjSize < PtrByteSize && !isLittleEndian)
- CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
- // The value of the object is its address.
- int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+ // Create a stack object covering all stack doublewords occupied
+ // by the argument. If the argument is (fully or partially) on
+ // the stack, or if the argument is fully in registers but the
+ // caller has allocated the parameter save anyway, we can refer
+ // directly to the caller's stack frame. Otherwise, create a
+ // local copy in our own frame.
+ int FI;
+ if (HasParameterArea ||
+ ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
+ FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false, true);
+ else
+ FI = MFI->CreateStackObject(ArgSize, Align, false);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- InVals.push_back(FIN);
- if (ObjSize < 8) {
+ // Handle aggregates smaller than 8 bytes.
+ if (ObjSize < PtrByteSize) {
+ // The value of the object is its address, which differs from the
+ // address of the enclosing doubleword on big-endian systems.
+ SDValue Arg = FIN;
+ if (!isLittleEndian) {
+ SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT);
+ Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
+ }
+ InVals.push_back(Arg);
+
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
(ObjSize == 2 ? MVT::i16 : MVT::i32));
- Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+ Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
MachinePointerInfo(FuncArg),
ObjType, false, false, 0);
} else {
// For sizes that don't fit a truncating store (3, 5, 6, 7),
// store the whole register as-is to the parameter save area
- // slot. The address of the parameter was already calculated
- // above (InVals.push_back(FIN)) to be the right-justified
- // offset within the slot. For this store, we need a new
- // frame index that points at the beginning of the slot.
- int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ // slot.
Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(FuncArg),
false, false, 0);
}
MemOps.push_back(Store);
- ++GPR_idx;
}
// Whether we copied from a register or not, advance the offset
// into the parameter save area by a full doubleword.
continue;
}
+ // The value of the object is its address, which is the address of
+ // its first stack doubleword.
+ InVals.push_back(FIN);
+
+ // Store whatever pieces of the object are in registers to memory.
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
- // Store whatever pieces of the object are in registers
- // to memory. ArgOffset will be the address of the beginning
- // of the object.
- if (GPR_idx != Num_GPR_Regs) {
- unsigned VReg;
- VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
- int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg, j),
- false, false, 0);
- MemOps.push_back(Store);
- ++GPR_idx;
- ArgOffset += PtrByteSize;
- } else {
- ArgOffset += ArgSize - j;
+ if (GPR_idx == Num_GPR_Regs)
break;
+
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Addr = FIN;
+ if (j) {
+ SDValue Off = DAG.getConstant(j, PtrVT);
+ Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
}
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
+ MachinePointerInfo(FuncArg, j),
+ false, false, 0);
+ MemOps.push_back(Store);
+ ++GPR_idx;
}
+ ArgOffset += ArgSize;
continue;
}
case MVT::i1:
case MVT::i32:
case MVT::i64:
+ // These can be scalar arguments or elements of an integer array type
+ // passed directly. Clang may use those instead of "byval" aggregate
+ // types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
-
- ++GPR_idx;
} else {
needsLoad = true;
ArgSize = PtrByteSize;
case MVT::f32:
case MVT::f64:
- // Every 8 bytes of argument space consumes one of the GPRs available for
- // argument passing.
- if (GPR_idx != Num_GPR_Regs) {
- ++GPR_idx;
- }
+ // These can be scalar arguments or elements of a float array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // float aggregates.
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
+ } else if (GPR_idx != Num_GPR_Regs) {
+ // This can only ever happen in the presence of f32 array types,
+ // since otherwise we never run out of FPRs before running out
+ // of GPRs.
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::f32) {
+ if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
+ ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, MVT::i32));
+ ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+ }
+
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
} else {
needsLoad = true;
- ArgSize = PtrByteSize;
}
- ArgOffset += 8;
+ // When passing an array of floats, the array occupies consecutive
+ // space in the argument area; only round up to the next doubleword
+ // at the end of the array. Otherwise, each float takes 8 bytes.
+ ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+ ArgOffset += ArgSize;
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
break;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
- // Vectors are aligned to a 16-byte boundary in the argument save area.
- while ((ArgOffset % 16) != 0) {
- ArgOffset += PtrByteSize;
- if (GPR_idx != Num_GPR_Regs)
- GPR_idx++;
- }
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // vector aggregates.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++VR_idx;
} else {
- CurArgOffset = ArgOffset;
needsLoad = true;
}
ArgOffset += 16;
- GPR_idx = std::min(GPR_idx + 2, Num_GPR_Regs);
break;
}
}
// Area that is at least reserved in the caller of this function.
- MinReservedArea =
- std::max(MinReservedArea,
- PPCFrameLowering::getMinCallFrameSize(true, true));
+ unsigned MinReservedArea;
+ if (HasParameterArea)
+ MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+ else
+ MinReservedArea = LinkageSize;
// Set the size that is at least reserved in caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by deferencing the
// result of va_next.
- for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+ for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx < Num_GPR_Regs; ++GPR_idx) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = isPPC64 ? 8 : 4;
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+ false);
+ unsigned ArgOffset = LinkageSize;
// Area that is at least reserved in caller of this function.
unsigned MinReservedArea = ArgOffset;
CurArgOffset = CurArgOffset + (4 - ObjSize);
}
// The value of the object is its address.
- int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+ int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
if (ObjSize==1 || ObjSize==2) {
}
// Area that is at least reserved in the caller of this function.
- MinReservedArea =
- std::max(MinReservedArea,
- PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
+ MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
InFlag = Chain.getValue(1);
}
+// Is this global address that of a function that can be called by name? (as
+// opposed to something that must hold a descriptor for an indirect call).
+static bool isFunctionGlobalAddress(SDValue Callee) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
+ Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+
+ return G->getGlobal()->getType()->getElementType()->isFunctionTy();
+ }
+
+ return false;
+}
+
static
unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
+ bool IsPatchPoint,
SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
const PPCSubtarget &Subtarget) {
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
NodeTys.push_back(MVT::Other); // Returns a chain
needIndirectCall = false;
}
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
- // Use indirect calls for ALL functions calls in JIT mode, since the
- // far-call stubs may be outside relocation limits for a BL instruction.
- if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
- unsigned OpFlags = 0;
- if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (Subtarget.getTargetTriple().isMacOSX() &&
- Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
- (G->getGlobal()->isDeclaration() ||
- G->getGlobal()->isWeakForLinker())) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = PPCII::MO_DARWIN_STUB;
- }
-
- // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
- // every direct call is) turn it into a TargetGlobalAddress /
- // TargetExternalSymbol node so that legalize doesn't hack it.
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
- Callee.getValueType(),
- 0, OpFlags);
- needIndirectCall = false;
+ if (isFunctionGlobalAddress(Callee)) {
+ GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+ // A call to a TLS address is actually an indirect call to a
+ // thread-specific pointer.
+ unsigned OpFlags = 0;
+ if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+ (Subtarget.getTargetTriple().isMacOSX() &&
+ Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
+ (G->getGlobal()->isDeclaration() ||
+ G->getGlobal()->isWeakForLinker())) ||
+ (Subtarget.isTargetELF() && !isPPC64 &&
+ !G->getGlobal()->hasLocalLinkage() &&
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = PPCII::MO_PLT_OR_STUB;
}
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+ // every direct call is) turn it into a TargetGlobalAddress /
+ // TargetExternalSymbol node so that legalize doesn't hack it.
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+ Callee.getValueType(), 0, OpFlags);
+ needIndirectCall = false;
}
if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
- if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (Subtarget.getTargetTriple().isMacOSX() &&
- Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+ (Subtarget.getTargetTriple().isMacOSX() &&
+ Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) ||
+ (Subtarget.isTargetELF() && !isPPC64 &&
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
// PC-relative references to external symbols should go through $stub,
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
- OpFlags = PPCII::MO_DARWIN_STUB;
+ OpFlags = PPCII::MO_PLT_OR_STUB;
}
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
needIndirectCall = false;
}
+ if (IsPatchPoint) {
+ // We'll form an invalid direct call when lowering a patchpoint; the full
+ // sequence for an indirect call is complicated, and many of the
+ // instructions introduced might have side effects (and, thus, can't be
+ // removed later). The call itself will be removed as soon as the
+ // argument/return lowering is complete, so the fact that it has the wrong
+ // kind of operands should not really matter.
+ needIndirectCall = false;
+ }
+
if (needIndirectCall) {
// Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair
// to do the call, we can't use PPCISD::CALL.
SDValue MTCTROps[] = {Chain, Callee, InFlag};
- if (isSVR4ABI && isPPC64) {
+ if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
// Function pointers in the 64-bit SVR4 ABI do not point to the function
// entry point, but to the function descriptor (the function entry point
// address is part of the function descriptor though).
CallOpc = PPCISD::BCTRL;
Callee.setNode(nullptr);
// Add use of X11 (holding environment pointer)
- if (isSVR4ABI && isPPC64)
+ if (isSVR4ABI && isPPC64 && !isELFv2ABI)
Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
// Add CTR register as callee so a bctr can be emitted later.
if (isTailCall)
if (Callee.getNode()) {
Ops.push_back(Chain);
Ops.push_back(Callee);
+
+ // If this is a call to __tls_get_addr, find the symbol whose address
+ // is to be taken and add it to the list. This will be used to
+ // generate __tls_get_addr(<sym>@tlsgd) or __tls_get_addr(<sym>@tlsld).
+ // We find the symbol by walking the chain to the CopyFromReg, walking
+ // back from the CopyFromReg to the ADDI_TLSGD_L or ADDI_TLSLD_L, and
+ // pulling the symbol from that node.
+ if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ if (!strcmp(S->getSymbol(), "__tls_get_addr")) {
+ assert(!needIndirectCall && "Indirect call to __tls_get_addr???");
+ SDNode *AddI = Chain.getNode()->getOperand(2).getNode();
+ SDValue TGTAddr = AddI->getOperand(1);
+ assert(TGTAddr.getNode()->getOpcode() == ISD::TargetGlobalTLSAddress &&
+ "Didn't find target global TLS address where we expected one");
+ Ops.push_back(TGTAddr);
+ CallOpc = PPCISD::CALL_TLS;
+ }
}
// If this is a tail call add stack pointer delta.
if (isTailCall)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
+ // Direct calls in the ELFv2 ABI need the TOC register live into the call.
+ if (Callee.getNode() && isELFv2ABI && !IsPatchPoint)
+ Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+
return CallOpc;
}
SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
SDValue
PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
- bool isTailCall, bool isVarArg,
+ bool isTailCall, bool isVarArg, bool IsPatchPoint,
SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8>
&RegsToPass,
int SPDiff, unsigned NumBytes,
const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals) const {
+
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
- isTailCall, RegsToPass, Ops, NodeTys,
- Subtarget);
+ isTailCall, IsPatchPoint, RegsToPass, Ops,
+ NodeTys, Subtarget);
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// stack frame. If caller and callee belong to the same module (and have the
// same TOC), the NOP will remain unchanged.
- bool needsTOCRestore = false;
- if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) {
+ if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
+ !IsPatchPoint) {
if (CallOpc == PPCISD::BCTRL) {
// This is a call through a function pointer.
// Restore the caller TOC from the save area into R2.
// since r2 is a reserved register (which prevents the register allocator
// from allocating it), resulting in an additional register being
// allocated and an unnecessary move instruction being generated.
- needsTOCRestore = true;
+ CallOpc = PPCISD::BCTRL_LOAD_TOC;
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
+ unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
+ SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
+ SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
+
+ // The address needs to go after the chain input but before the flag (or
+ // any other variadic arguments).
+ Ops.insert(std::next(Ops.begin()), AddTOC);
} else if ((CallOpc == PPCISD::CALL) &&
(!isLocalCall(Callee) ||
DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
// Otherwise insert NOP for non-local calls.
CallOpc = PPCISD::CALL_NOP;
- }
+ } else if (CallOpc == PPCISD::CALL_TLS)
+ // For 64-bit SVR4, TLS calls are always non-local.
+ CallOpc = PPCISD::CALL_NOP_TLS;
}
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
- if (needsTOCRestore) {
- SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
- SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
- unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
- SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
- SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
- Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag);
- InFlag = Chain.getValue(1);
- }
-
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(BytesCalleePops, true),
InFlag, dl);
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
+ bool IsPatchPoint = CLI.IsPatchPoint;
if (isTailCall)
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
if (Subtarget.isSVR4ABI()) {
if (Subtarget.isPPC64())
return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, Outs, OutVals, Ins,
+ isTailCall, IsPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals);
else
return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, Outs, OutVals, Ins,
+ isTailCall, IsPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals);
}
return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
- isTailCall, Outs, OutVals, Ins,
+ isTailCall, IsPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals);
}
SDValue
PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall,
+ bool isTailCall, bool IsPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage area on the stack.
- CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
+ CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false),
+ PtrByteSize);
if (isVarArg) {
// Handle fixed and variable vector arguments differently.
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ByValArgLocs, *DAG.getContext());
+ ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
false, TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
Ins, InVals);
}
SDValue
PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall,
+ bool isTailCall, bool IsPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
- // area, and parameter passing area. We start with at least 48 bytes, which
- // is reserved space for [SP][CR][LR][3 x unused].
- unsigned NumBytes = PPCFrameLowering::getLinkageSize(true, true);
+ // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
+ // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+ // area is 32 bytes reserved space for [SP][CR][LR][TOC].
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+ isELFv2ABI);
+ unsigned NumBytes = LinkageSize;
// Add up all the space actually used.
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
- // Altivec parameters are padded to a 16 byte boundary.
- if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
- ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
- ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
- NumBytes = ((NumBytes+15)/16)*16;
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ NumBytes = ((NumBytes + Align - 1) / Align) * Align;
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ if (Flags.isInConsecutiveRegsLast())
+ NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
+ unsigned NumBytesActuallyUsed = NumBytes;
+
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if its varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
- NumBytes = std::max(NumBytes,
- PPCFrameLowering::getMinCallFrameSize(true, true));
+ // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
- unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
+
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+ /* Compute GPR index associated with argument offset. */
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, NumGPRs);
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
if (Size == 0)
continue;
- unsigned BVAlign = Flags.getByValAlign();
- if (BVAlign > 8) {
- if (BVAlign % PtrByteSize != 0)
- llvm_unreachable(
- "ByVal alignment is not a multiple of the pointer size");
-
- ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
- }
-
// All aggregates smaller than 8 bytes must be passed right-justified.
if (Size==1 || Size==2 || Size==4) {
EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT,
- false, false, 0);
+ false, false, false, 0);
MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
ArgOffset += PtrByteSize;
continue;
MachinePointerInfo(),
false, false, false, 0);
MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
// Done with this argument.
ArgOffset += PtrByteSize;
case MVT::i1:
case MVT::i32:
case MVT::i64:
+ // These can be scalar arguments or elements of an integer array type
+ // passed directly. Clang may use those instead of "byval" aggregate
+ // types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != NumGPRs) {
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
ArgOffset += PtrByteSize;
break;
case MVT::f32:
- case MVT::f64:
- if (FPR_idx != NumFPRs) {
+ case MVT::f64: {
+ // These can be scalar arguments or elements of a float array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // float aggregates.
+
+ // Named arguments go into FPRs first, and once they overflow, the
+ // remaining arguments go into GPRs and then the parameter save area.
+ // Unnamed arguments for vararg functions always go to GPRs and
+ // then the parameter save area. For now, put all arguments to vararg
+ // routines always in both locations (FPR *and* GPR or stack slot).
+ bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+
+ // First load the argument into the next available FPR.
+ if (FPR_idx != NumFPRs)
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
- if (isVarArg) {
- // A single float or an aggregate containing only a single float
- // must be passed right-justified in the stack doubleword, and
- // in the GPR, if one is available.
- SDValue StoreOff;
- if (Arg.getSimpleValueType().SimpleTy == MVT::f32 &&
- !isLittleEndian) {
- SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
- StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
- } else
- StoreOff = PtrOff;
-
- SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
- MachinePointerInfo(), false, false, 0);
- MemOpChains.push_back(Store);
-
- // Float varargs are always shadowed in available integer registers
- if (GPR_idx != NumGPRs) {
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
- MachinePointerInfo(), false, false,
- false, 0);
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
- }
- } else if (GPR_idx != NumGPRs)
- // If we have any FPRs remaining, we may also have GPRs remaining.
- ++GPR_idx;
+ // Next, load the argument into GPR or stack slot if needed.
+ if (!NeedGPROrStack)
+ ;
+ else if (GPR_idx != NumGPRs) {
+ // In the non-vararg case, this can only ever happen in the
+ // presence of f32 array types, since otherwise we never run
+ // out of FPRs before running out of GPRs.
+ SDValue ArgVal;
+
+ // Double values are always passed in a single GPR.
+ if (Arg.getValueType() != MVT::f32) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+
+ // Non-array float values are extended and passed in a GPR.
+ } else if (!Flags.isInConsecutiveRegs()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+ // If we have an array of floats, we collect every odd element
+ // together with its predecessor into one GPR.
+ } else if (ArgOffset % PtrByteSize != 0) {
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+ Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ if (!isLittleEndian)
+ std::swap(Lo, Hi);
+ ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+ // The final element, if even, goes into the first half of a GPR.
+ } else if (Flags.isInConsecutiveRegsLast()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+ if (!isLittleEndian)
+ ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, MVT::i32));
+
+ // Non-final even elements are skipped; they will be handled
+ // together the with subsequent argument on the next go-around.
+ } else
+ ArgVal = SDValue();
+
+ if (ArgVal.getNode())
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal));
} else {
// Single-precision floating-point values are mapped to the
// second (rightmost) word of the stack doubleword.
- if (Arg.getValueType() == MVT::f32 && !isLittleEndian) {
+ if (Arg.getValueType() == MVT::f32 &&
+ !isLittleEndian && !Flags.isInConsecutiveRegs()) {
SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
}
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
}
- ArgOffset += 8;
+ // When passing an array of floats, the array occupies consecutive
+ // space in the argument area; only round up to the next doubleword
+ // at the end of the array. Otherwise, each float takes 8 bytes.
+ ArgOffset += (Arg.getValueType() == MVT::f32 &&
+ Flags.isInConsecutiveRegs()) ? 4 : 8;
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
break;
+ }
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
- // Vectors are aligned to a 16-byte boundary in the argument save area.
- while (ArgOffset % 16 !=0) {
- ArgOffset += PtrByteSize;
- if (GPR_idx != NumGPRs)
- GPR_idx++;
- }
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // vector aggregates.
// For a varargs call, named arguments go into VRs or on the stack as
// usual; unnamed arguments always go to the stack or the corresponding
if (isVarArg) {
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
- PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
- DAG.getConstant(ArgOffset, PtrVT));
SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
MachinePointerInfo(), false, false, 0);
MemOpChains.push_back(Store);
TailCallArguments, dl);
}
ArgOffset += 16;
- GPR_idx = std::min(GPR_idx + 2, NumGPRs);
break;
}
}
+ assert(NumBytesActuallyUsed == ArgOffset);
+ (void)NumBytesActuallyUsed;
+
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Check if this is an indirect call (MTCTR/BCTRL).
// See PrepareCall() for more information about calls through function
// pointers in the 64-bit SVR4 ABI.
- if (!isTailCall &&
- !dyn_cast<GlobalAddressSDNode>(Callee) &&
- !dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ if (!isTailCall && !IsPatchPoint &&
+ !isFunctionGlobalAddress(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) {
// Load r2 into a virtual register and store it to the TOC save area.
SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
// TOC save area offset.
- unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
+ unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
false, false, 0);
+ // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+ // This does not mean the MTCTR instruction must use R12; it's easier
+ // to model this as an extra parameter, so do that.
+ if (isELFv2ABI && !IsPatchPoint)
+ RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
FPOp, true, TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
Ins, InVals);
}
SDValue
PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall,
+ bool isTailCall, bool IsPatchPoint,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. We start with 24/48 bytes, which is
// prereserved space for [SP][CR][LR][3 x unused].
- unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+ false);
+ unsigned NumBytes = LinkageSize;
// Add up all the space actually used.
// In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
- NumBytes = std::max(NumBytes,
- PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+ unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT,
- false, false, 0);
+ false, false, false, 0);
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// not mean the MTCTR instruction must use R12; it's easier to model this as
// an extra parameter, so do that.
if (!isTailCall &&
- !dyn_cast<GlobalAddressSDNode>(Callee) &&
- !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+ !isFunctionGlobalAddress(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee) &&
!isBLACompatibleAddress(Callee, DAG))
RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
PPC::R12), Callee));
PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
FPOp, true, TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
Ins, InVals);
}
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
- RVLocs, Context);
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_PPC);
}
SDLoc dl, SelectionDAG &DAG) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
SDValue Flag;
// Find out what the fix offset of the frame pointer save area.
int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
// Allocate the frame index for frame pointer save area.
- RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, true);
+ RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
// Save the result.
FI->setReturnAddrSaveIndex(RASI);
}
return Op;
}
-// FIXME: Split this code up when LegalizeDAGTypes lands.
-SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
- SDLoc dl) const {
+void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+ SelectionDAG &DAG,
+ SDLoc dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::f32)
if (Op.getValueType() == MVT::i32 && !i32Stack) {
FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
DAG.getConstant(4, FIPtr.getValueType()));
- MPI = MachinePointerInfo();
+ MPI = MPI.getWithOffset(4);
}
- return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI,
- false, false, false, 0);
+ RLI.Chain = Chain;
+ RLI.Ptr = FIPtr;
+ RLI.MPI = MPI;
}
-SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+ SDLoc dl) const {
+ ReuseLoadInfo RLI;
+ LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+
+ return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
+ false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
+ RLI.Ranges);
+}
+
+// We're trying to insert a regular store, S, and then a load, L. If the
+// incoming value, O, is a load, we might just be able to have our load use the
+// address used by O. However, we don't know if anything else will store to
+// that address before we can load from it. To prevent this situation, we need
+// to insert our load, L, into the chain as a peer of O. To do this, we give L
+// the same chain operand as O, we create a token factor from the chain results
+// of O and L, and we replace all uses of O's chain result with that token
+// factor (see spliceIntoChain below for this last part).
+bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
+ ReuseLoadInfo &RLI,
+ SelectionDAG &DAG,
+ ISD::LoadExtType ET) const {
SDLoc dl(Op);
- // Don't handle ppc_fp128 here; let it be lowered to a libcall.
- if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
- return SDValue();
+ if (ET == ISD::NON_EXTLOAD &&
+ (Op.getOpcode() == ISD::FP_TO_UINT ||
+ Op.getOpcode() == ISD::FP_TO_SINT) &&
+ isOperationLegalOrCustom(Op.getOpcode(),
+ Op.getOperand(0).getValueType())) {
+
+ LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+ return true;
+ }
+
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
+ if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
+ LD->isNonTemporal())
+ return false;
+ if (LD->getMemoryVT() != MemVT)
+ return false;
+
+ RLI.Ptr = LD->getBasePtr();
+ if (LD->isIndexed() && LD->getOffset().getOpcode() != ISD::UNDEF) {
+ assert(LD->getAddressingMode() == ISD::PRE_INC &&
+ "Non-pre-inc AM on PPC?");
+ RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
+ LD->getOffset());
+ }
+
+ RLI.Chain = LD->getChain();
+ RLI.MPI = LD->getPointerInfo();
+ RLI.IsInvariant = LD->isInvariant();
+ RLI.Alignment = LD->getAlignment();
+ RLI.AAInfo = LD->getAAInfo();
+ RLI.Ranges = LD->getRanges();
+
+ RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
+ return true;
+}
+
+// Given the head of the old chain, ResChain, insert a token factor containing
+// it and NewResChain, and make users of ResChain now be users of that token
+// factor.
+void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
+ SDValue NewResChain,
+ SelectionDAG &DAG) const {
+ if (!ResChain)
+ return;
+
+ SDLoc dl(NewResChain);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ NewResChain, DAG.getUNDEF(MVT::Other));
+ assert(TF.getNode() != NewResChain.getNode() &&
+ "A new TF really is required here");
+
+ DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
+ DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
+}
+
+SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ // Don't handle ppc_fp128 here; let it be lowered to a libcall.
+ if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+ return SDValue();
if (Op.getOperand(0).getValueType() == MVT::i1)
return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
}
- SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+ ReuseLoadInfo RLI;
+ SDValue Bits;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
+ Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
+ false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
+ RLI.Ranges);
+ spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+ } else if (Subtarget.hasLFIWAX() &&
+ canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+ Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
+ DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+ } else if (Subtarget.hasFPCVT() &&
+ canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+ Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
+ DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+ } else if (((Subtarget.hasLFIWAX() &&
+ SINT.getOpcode() == ISD::SIGN_EXTEND) ||
+ (Subtarget.hasFPCVT() &&
+ SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
+ SINT.getOperand(0).getValueType() == MVT::i32) {
+ MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(FrameIdx),
+ false, false, 0);
+
+ assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+ "Expected an i32 store");
+
+ RLI.Ptr = FIdx;
+ RLI.Chain = Store;
+ RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+ RLI.Alignment = 4;
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+ Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
+ PPCISD::LFIWZX : PPCISD::LFIWAX,
+ dl, DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ } else
+ Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
SDValue Ld;
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
- int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
- SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
-
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
- MachinePointerInfo::getFixedStack(FrameIdx),
- false, false, 0);
+ ReuseLoadInfo RLI;
+ bool ReusingLoad;
+ if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
+ DAG))) {
+ int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(FrameIdx),
+ false, false, 0);
+
+ assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+ "Expected an i32 store");
+
+ RLI.Ptr = FIdx;
+ RLI.Chain = Store;
+ RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+ RLI.Alignment = 4;
+ }
- assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
- "Expected an i32 store");
MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- MachineMemOperand::MOLoad, 4, 4);
- SDValue Ops[] = { Store, FIdx };
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
PPCISD::LFIWZX : PPCISD::LFIWAX,
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
+ if (ReusingLoad)
+ spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
} else {
assert(Subtarget.isPPC64() &&
"i32->FP without LFIWAX supported only on PPC64");
if (PPC::isSplatShuffleMask(SVOp, 1) ||
PPC::isSplatShuffleMask(SVOp, 2) ||
PPC::isSplatShuffleMask(SVOp, 4) ||
- PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) ||
- PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) ||
- PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 ||
- PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) ||
- PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) ||
- PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) ||
- PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) ||
- PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) ||
- PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) {
+ PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
+ PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG)) {
return Op;
}
}
// Altivec has a variety of "shuffle immediates" that take two vector inputs
// and produce a fixed permutation. If any of these match, do not lower to
// VPERM.
- if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) ||
- PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) ||
- PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 ||
- PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) ||
- PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) ||
- PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) ||
- PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) ||
- PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) ||
- PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG))
+ unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
+ if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+ PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG))
return Op;
// Check to see if this is a shuffle of 4-byte values. If so, we can use our
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
- SDLoc(Op));
+ SDLoc(Op));
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::READCYCLECOUNTER: {
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
+
+ Results.push_back(RTB);
+ Results.push_back(RTB.getValue(1));
+ Results.push_back(RTB.getValue(2));
+ break;
+ }
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
Intrinsic::ppc_is_decremented_ctr_nonzero)
// Other Lowering Code
//===----------------------------------------------------------------------===//
+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Function *Func = Intrinsic::getDeclaration(M, Id);
+ return Builder.CreateCall(Func);
+}
+
+// The mappings for emitLeading/TrailingFence is taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ AtomicOrdering Ord, bool IsStore,
+ bool IsLoad) const {
+ if (Ord == SequentiallyConsistent)
+ return callIntrinsic(Builder, Intrinsic::ppc_sync);
+ else if (isAtLeastRelease(Ord))
+ return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+ else
+ return nullptr;
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ AtomicOrdering Ord, bool IsStore,
+ bool IsLoad) const {
+ if (IsLoad && isAtLeastAcquire(Ord))
+ return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+ // FIXME: this is too conservative, a dependent branch + isync is enough.
+ // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+ // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+ // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+ else
+ return nullptr;
+}
+
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
bool is64bit, unsigned BinOpcode) const {
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
MachineRegisterInfo &RegInfo = F->getRegInfo();
unsigned TmpReg = (!BinOpcode) ? incr :
- RegInfo.createVirtualRegister(
- is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
- (const TargetRegisterClass *) &PPC::GPRCRegClass);
+ RegInfo.createVirtualRegister( is64bit ? &PPC::G8RCRegClass
+ : &PPC::GPRCRegClass);
// thisMBB:
// ...
bool is8bit, // operation
unsigned BinOpcode) const {
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
// In 64 bit mode we have to use 64 bits for addresses, even though the
// lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
// registers without caring whether they're 32 or 64, but here we're
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
- const TargetRegisterClass *RC =
- is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
- (const TargetRegisterClass *) &PPC::GPRCRegClass;
+ const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+ : &PPC::GPRCRegClass;
unsigned PtrReg = RegInfo.createVirtualRegister(RC);
unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
const PPCRegisterInfo *TRI =
- static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
+ getTargetMachine().getSubtarget<PPCSubtarget>().getRegisterInfo();
MIB.addRegMask(TRI->getNoPreservedMask());
BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Since FP is only updated here but NOT referenced, it's treated as GPR.
unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
- unsigned BP = (PVT == MVT::i64) ? PPC::X30 : PPC::R30;
+ unsigned BP = (PVT == MVT::i64) ? PPC::X30 :
+ (Subtarget.isSVR4ABI() &&
+ MF->getTarget().getRelocationModel() == Reloc::PIC_ ?
+ PPC::R29 : PPC::R30);
MachineInstrBuilder MIB;
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
+ if (MI->getOpcode() == TargetOpcode::STACKMAP ||
+ MI->getOpcode() == TargetOpcode::PATCHPOINT)
+ return emitPatchPoint(MI, BB);
+
if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 ||
MI->getOpcode() == PPC::EH_SjLj_SetJmp64) {
return emitEHSjLjSetJmp(MI, BB);
return emitEHSjLjLongJmp(MI, BB);
}
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
// To "insert" these instructions we actually have to insert their
// control-flow patterns.
Cond.push_back(MI->getOperand(1));
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
Cond, MI->getOperand(2).getReg(),
MI->getOperand(3).getReg());
MI->getOpcode() == PPC::SELECT_CC_F4 ||
MI->getOpcode() == PPC::SELECT_CC_F8 ||
MI->getOpcode() == PPC::SELECT_CC_VRRC ||
+ MI->getOpcode() == PPC::SELECT_CC_VSFRC ||
+ MI->getOpcode() == PPC::SELECT_CC_VSRC ||
MI->getOpcode() == PPC::SELECT_I4 ||
MI->getOpcode() == PPC::SELECT_I8 ||
MI->getOpcode() == PPC::SELECT_F4 ||
MI->getOpcode() == PPC::SELECT_F8 ||
- MI->getOpcode() == PPC::SELECT_VRRC) {
+ MI->getOpcode() == PPC::SELECT_VRRC ||
+ MI->getOpcode() == PPC::SELECT_VSFRC ||
+ MI->getOpcode() == PPC::SELECT_VSRC) {
// The incoming instruction knows the destination vreg to set, the
// condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
MI->getOpcode() == PPC::SELECT_I8 ||
MI->getOpcode() == PPC::SELECT_F4 ||
MI->getOpcode() == PPC::SELECT_F8 ||
- MI->getOpcode() == PPC::SELECT_VRRC) {
+ MI->getOpcode() == PPC::SELECT_VRRC ||
+ MI->getOpcode() == PPC::SELECT_VSFRC ||
+ MI->getOpcode() == PPC::SELECT_VSRC) {
BuildMI(BB, dl, TII->get(PPC::BC))
.addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
} else {
TII->get(PPC::PHI), MI->getOperand(0).getReg())
.addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
.addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ } else if (MI->getOpcode() == PPC::ReadTB) {
+ // To read the 64-bit time-base register on a 32-bit target, we read the
+ // two halves. Should the counter have wrapped while it was being read, we
+ // need to try again.
+ // ...
+ // readLoop:
+ // mfspr Rx,TBU # load from TBU
+ // mfspr Ry,TB # load from TB
+ // mfspr Rz,TBU # load from TBU
+ // cmpw crX,Rx,Rz # check if ‘old’=’new’
+ // bne readLoop # branch if they're not equal
+ // ...
+
+ MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ DebugLoc dl = MI->getDebugLoc();
+ F->insert(It, readMBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ BB->addSuccessor(readMBB);
+ BB = readMBB;
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ unsigned LoReg = MI->getOperand(0).getReg();
+ unsigned HiReg = MI->getOperand(1).getReg();
+
+ BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
+ BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
+ BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
+
+ unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+
+ BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
+ .addReg(HiReg).addReg(ReadAgainReg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB);
+
+ BB->addSuccessor(readMBB);
+ BB->addSuccessor(sinkMBB);
}
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
- BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC);
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
- BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC);
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
- BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC);
+ BB = EmitAtomicBinary(MI, BB, false, PPC::NAND);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
- BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8);
+ BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
- const TargetRegisterClass *RC =
- is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
- (const TargetRegisterClass *) &PPC::GPRCRegClass;
+ const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+ : &PPC::GPRCRegClass;
unsigned PtrReg = RegInfo.createVirtualRegister(RC);
unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
// Target Optimization Hooks
//===----------------------------------------------------------------------===//
-SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
- DAGCombinerInfo &DCI) const {
- if (DCI.isAfterLegalizeVectorOps())
- return SDValue();
-
- EVT VT = Op.getValueType();
-
- if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
- (VT == MVT::f64 && Subtarget.hasFRE()) ||
+SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ EVT VT = Operand.getValueType();
+ if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+ (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
- // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
- // For the reciprocal, we need to find the zero of the function:
- // F(X) = A X - 1 [which has a zero at X = 1/A]
- // =>
- // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
- // does not require additional intermediate precision]
-
// Convergence is quadratic, so we essentially double the number of digits
- // correct after every iteration. The minimum architected relative
- // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
- // 23 digits and double has 52 digits.
- int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+ // correct after every iteration. For both FRE and FRSQRTE, the minimum
+ // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
+ // 2^-14. IEEE float has 23 digits and double has 52 digits.
+ RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
- ++Iterations;
-
- SelectionDAG &DAG = DCI.DAG;
- SDLoc dl(Op);
-
- SDValue FPOne =
- DAG.getConstantFP(1.0, VT.getScalarType());
- if (VT.isVector()) {
- assert(VT.getVectorNumElements() == 4 &&
- "Unknown vector type");
- FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- FPOne, FPOne, FPOne, FPOne);
- }
-
- SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op);
- DCI.AddToWorklist(Est.getNode());
-
- // Newton iterations: Est = Est + Est (1 - Arg * Est)
- for (int i = 0; i < Iterations; ++i) {
- SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est);
- DCI.AddToWorklist(NewEst.getNode());
-
- NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst);
- DCI.AddToWorklist(NewEst.getNode());
-
- NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
- DCI.AddToWorklist(NewEst.getNode());
-
- Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst);
- DCI.AddToWorklist(Est.getNode());
- }
-
- return Est;
+ ++RefinementSteps;
+ UseOneConstNR = true;
+ return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
-
return SDValue();
}
-SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
- DAGCombinerInfo &DCI) const {
- if (DCI.isAfterLegalizeVectorOps())
- return SDValue();
-
- EVT VT = Op.getValueType();
-
- if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
- (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
+SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const {
+ EVT VT = Operand.getValueType();
+ if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
+ (VT == MVT::f64 && Subtarget.hasFRE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
- // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
- // For the reciprocal sqrt, we need to find the zero of the function:
- // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
- // =>
- // X_{i+1} = X_i (1.5 - A X_i^2 / 2)
- // As a result, we precompute A/2 prior to the iteration loop.
-
// Convergence is quadratic, so we essentially double the number of digits
- // correct after every iteration. The minimum architected relative
- // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
- // 23 digits and double has 52 digits.
- int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+ // correct after every iteration. For both FRE and FRSQRTE, the minimum
+ // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
+ // 2^-14. IEEE float has 23 digits and double has 52 digits.
+ RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
- ++Iterations;
-
- SelectionDAG &DAG = DCI.DAG;
- SDLoc dl(Op);
-
- SDValue FPThreeHalves =
- DAG.getConstantFP(1.5, VT.getScalarType());
- if (VT.isVector()) {
- assert(VT.getVectorNumElements() == 4 &&
- "Unknown vector type");
- FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- FPThreeHalves, FPThreeHalves,
- FPThreeHalves, FPThreeHalves);
- }
-
- SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op);
- DCI.AddToWorklist(Est.getNode());
-
- // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
- // this entire sequence requires only one FP constant.
- SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op);
- DCI.AddToWorklist(HalfArg.getNode());
-
- HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op);
- DCI.AddToWorklist(HalfArg.getNode());
-
- // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
- for (int i = 0; i < Iterations; ++i) {
- SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est);
- DCI.AddToWorklist(NewEst.getNode());
-
- NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst);
- DCI.AddToWorklist(NewEst.getNode());
+ ++RefinementSteps;
+ return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
+ }
+ return SDValue();
+}
- NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst);
- DCI.AddToWorklist(NewEst.getNode());
+bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+ // Note: This functionality is used only when unsafe-fp-math is enabled, and
+ // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+ // enabled for division), this functionality is redundant with the default
+ // combiner logic (once the division -> reciprocal/multiply transformation
+ // has taken place). As a result, this matters more for older cores than for
+ // newer ones.
- Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
- DCI.AddToWorklist(Est.getNode());
- }
-
- return Est;
+ // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+ // reciprocal if there are two or more FDIVs (for embedded cores with only
+ // one FP pipeline) for three or more FDIVs (for generic OOO cores).
+ switch (Subtarget.getDarwinDirective()) {
+ default:
+ return NumUsers > 2;
+ case PPC::DIR_440:
+ case PPC::DIR_A2:
+ case PPC::DIR_E500mc:
+ case PPC::DIR_E5500:
+ return NumUsers > 1;
}
-
- return SDValue();
}
-// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
-// not enforce equality of the chain operands.
-static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
+static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
unsigned Bytes, int Dist,
SelectionDAG &DAG) {
- EVT VT = LS->getMemoryVT();
if (VT.getSizeInBits() / 8 != Bytes)
return false;
- SDValue Loc = LS->getBasePtr();
SDValue BaseLoc = Base->getBasePtr();
if (Loc.getOpcode() == ISD::FrameIndex) {
if (BaseLoc.getOpcode() != ISD::FrameIndex)
return false;
}
+// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
+// not enforce equality of the chain operands.
+static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
+ unsigned Bytes, int Dist,
+ SelectionDAG &DAG) {
+ if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
+ EVT VT = LS->getMemoryVT();
+ SDValue Loc = LS->getBasePtr();
+ return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
+ }
+
+ if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ EVT VT;
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ default: return false;
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::ppc_vsx_lxvw4x:
+ VT = MVT::v4i32;
+ break;
+ case Intrinsic::ppc_vsx_lxvd2x:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_altivec_lvebx:
+ VT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_lvehx:
+ VT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_lvewx:
+ VT = MVT::i32;
+ break;
+ }
+
+ return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
+ }
+
+ if (N->getOpcode() == ISD::INTRINSIC_VOID) {
+ EVT VT;
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ default: return false;
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ case Intrinsic::ppc_vsx_stxvw4x:
+ VT = MVT::v4i32;
+ break;
+ case Intrinsic::ppc_vsx_stxvd2x:
+ VT = MVT::v2f64;
+ break;
+ case Intrinsic::ppc_altivec_stvebx:
+ VT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_stvehx:
+ VT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_stvewx:
+ VT = MVT::i32;
+ break;
+ }
+
+ return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
+ }
+
+ return false;
+}
+
// Return true is there is a nearyby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking though
-// token factors and other loads (but nothing else). As a result, a true
-// results indicates that it is safe to create a new consecutive load adjacent
-// to the load provided.
+// token factors and other loads (but nothing else). As a result, a true result
+// indicates that it is safe to create a new consecutive load adjacent to the
+// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
SDValue Chain = LD->getChain();
EVT VT = LD->getMemoryVT();
// nodes just above the top-level loads and token factors.
while (!Queue.empty()) {
SDNode *ChainNext = Queue.pop_back_val();
- if (!Visited.insert(ChainNext))
+ if (!Visited.insert(ChainNext).second)
continue;
- if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+ if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
if (!Visited.count(ChainLD->getChain().getNode()))
Queue.push_back(ChainLD->getChain().getNode());
} else if (ChainNext->getOpcode() == ISD::TokenFactor) {
- for (SDNode::op_iterator O = ChainNext->op_begin(),
- OE = ChainNext->op_end(); O != OE; ++O)
- if (!Visited.count(O->getNode()))
- Queue.push_back(O->getNode());
+ for (const SDUse &O : ChainNext->ops())
+ if (!Visited.count(O.getNode()))
+ Queue.push_back(O.getNode());
} else
LoadRoots.insert(ChainNext);
}
while (!Queue.empty()) {
SDNode *LoadRoot = Queue.pop_back_val();
- if (!Visited.insert(LoadRoot))
+ if (!Visited.insert(LoadRoot).second)
continue;
- if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+ if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
for (SDNode::use_iterator UI = LoadRoot->use_begin(),
UE = LoadRoot->use_end(); UI != UE; ++UI)
- if (((isa<LoadSDNode>(*UI) &&
- cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+ if (((isa<MemSDNode>(*UI) &&
+ cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
Queue.push_back(*UI);
}
SDValue BinOp = BinOps.back();
BinOps.pop_back();
- if (!Visited.insert(BinOp.getNode()))
+ if (!Visited.insert(BinOp.getNode()).second)
continue;
PromOps.push_back(BinOp);
SDValue BinOp = BinOps.back();
BinOps.pop_back();
- if (!Visited.insert(BinOp.getNode()))
+ if (!Visited.insert(BinOp.getNode()).second)
continue;
PromOps.push_back(BinOp);
}
}
+ // The operands of a select that must be truncated when the select is
+ // promoted because the operand is actually part of the to-be-promoted set.
+ DenseMap<SDNode *, EVT> SelectTruncOp[2];
+
// Make sure that this is a self-contained cluster of operations (which
// is not quite the same thing as saying that everything has only one
// use).
if (User != N && !Visited.count(User))
return SDValue();
- // Make sure that we're not going to promote the non-output-value
- // operand(s) or SELECT or SELECT_CC.
- // FIXME: Although we could sometimes handle this, and it does occur in
- // practice that one of the condition inputs to the select is also one of
- // the outputs, we currently can't deal with this.
+ // If we're going to promote the non-output-value operand(s) or SELECT or
+ // SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == Inputs[i])
- return SDValue();
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
} else if (User->getOpcode() == ISD::SELECT_CC) {
- if (User->getOperand(0) == Inputs[i] ||
- User->getOperand(1) == Inputs[i])
- return SDValue();
+ if (User->getOperand(0) == Inputs[i])
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
+ if (User->getOperand(1) == Inputs[i])
+ SelectTruncOp[1].insert(std::make_pair(User,
+ User->getOperand(1).getValueType()));
}
}
}
if (User != N && !Visited.count(User))
return SDValue();
- // Make sure that we're not going to promote the non-output-value
- // operand(s) or SELECT or SELECT_CC.
- // FIXME: Although we could sometimes handle this, and it does occur in
- // practice that one of the condition inputs to the select is also one of
- // the outputs, we currently can't deal with this.
+ // If we're going to promote the non-output-value operand(s) or SELECT or
+ // SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == PromOps[i])
- return SDValue();
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
} else if (User->getOpcode() == ISD::SELECT_CC) {
- if (User->getOperand(0) == PromOps[i] ||
- User->getOperand(1) == PromOps[i])
- return SDValue();
+ if (User->getOperand(0) == PromOps[i])
+ SelectTruncOp[0].insert(std::make_pair(User,
+ User->getOperand(0).getValueType()));
+ if (User->getOperand(1) == PromOps[i])
+ SelectTruncOp[1].insert(std::make_pair(User,
+ User->getOperand(1).getValueType()));
}
}
}
continue;
}
+ // For SELECT and SELECT_CC nodes, we do a similar check for any
+ // to-be-promoted comparison inputs.
+ if (PromOp.getOpcode() == ISD::SELECT ||
+ PromOp.getOpcode() == ISD::SELECT_CC) {
+ if ((SelectTruncOp[0].count(PromOp.getNode()) &&
+ PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
+ (SelectTruncOp[1].count(PromOp.getNode()) &&
+ PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
+ PromOps.insert(PromOps.begin(), PromOp);
+ continue;
+ }
+ }
+
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
PromOp.getNode()->op_end());
Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
}
+ // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
+ // truncate them again to the original value type.
+ if (PromOp.getOpcode() == ISD::SELECT ||
+ PromOp.getOpcode() == ISD::SELECT_CC) {
+ auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
+ if (SI0 != SelectTruncOp[0].end())
+ Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
+ auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
+ if (SI1 != SelectTruncOp[1].end())
+ Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
+ }
+
DAG.ReplaceAllUsesOfValueWith(PromOp,
DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
}
N->getOperand(0), ShiftCst), ShiftCst);
}
+SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ assert((N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::UINT_TO_FP) &&
+ "Need an int -> FP conversion node here");
+
+ if (!Subtarget.has64BitSupport())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ SDValue Op(N, 0);
+
+ // Don't handle ppc_fp128 here or i1 conversions.
+ if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+ return SDValue();
+ if (Op.getOperand(0).getValueType() == MVT::i1)
+ return SDValue();
+
+ // For i32 intermediate values, unfortunately, the conversion functions
+ // leave the upper 32 bits of the value are undefined. Within the set of
+ // scalar instructions, we have no method for zero- or sign-extending the
+ // value. Thus, we cannot handle i32 intermediate values here.
+ if (Op.getOperand(0).getValueType() == MVT::i32)
+ return SDValue();
+
+ assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+ "UINT_TO_FP is supported only with FPCVT");
+
+ // If we have FCFIDS, then use it when converting to single-precision.
+ // Otherwise, convert to double-precision and then round.
+ unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+ (Op.getOpcode() == ISD::UINT_TO_FP ?
+ PPCISD::FCFIDUS : PPCISD::FCFIDS) :
+ (Op.getOpcode() == ISD::UINT_TO_FP ?
+ PPCISD::FCFIDU : PPCISD::FCFID);
+ MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+ MVT::f32 : MVT::f64;
+
+ // If we're converting from a float, to an int, and back to a float again,
+ // then we don't need the store/load pair at all.
+ if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
+ Subtarget.hasFPCVT()) ||
+ (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
+ SDValue Src = Op.getOperand(0).getOperand(0);
+ if (Src.getValueType() == MVT::f32) {
+ Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+ DCI.AddToWorklist(Src.getNode());
+ }
+
+ unsigned FCTOp =
+ Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+ PPCISD::FCTIDUZ;
+
+ SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
+ SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
+
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
+ FP = DAG.getNode(ISD::FP_ROUND, dl,
+ MVT::f32, FP, DAG.getIntPtrConstant(0));
+ DCI.AddToWorklist(FP.getNode());
+ }
+
+ return FP;
+ }
+
+ return SDValue();
+}
+
+// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
+// builtins) into loads with swaps.
+SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ SDValue Chain;
+ SDValue Base;
+ MachineMemOperand *MMO;
+
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode for little endian VSX load");
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ Chain = LD->getChain();
+ Base = LD->getBasePtr();
+ MMO = LD->getMemOperand();
+ // If the MMO suggests this isn't a load of a full vector, leave
+ // things alone. For a built-in, we have to make the change for
+ // correctness, so if there is a size problem that will be a bug.
+ if (MMO->getSize() < 16)
+ return SDValue();
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+ Chain = Intrin->getChain();
+ Base = Intrin->getBasePtr();
+ MMO = Intrin->getMemOperand();
+ break;
+ }
+ }
+
+ MVT VecTy = N->getValueType(0).getSimpleVT();
+ SDValue LoadOps[] = { Chain, Base };
+ SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
+ DAG.getVTList(VecTy, MVT::Other),
+ LoadOps, VecTy, MMO);
+ DCI.AddToWorklist(Load.getNode());
+ Chain = Load.getValue(1);
+ SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+ DAG.getVTList(VecTy, MVT::Other), Chain, Load);
+ DCI.AddToWorklist(Swap.getNode());
+ return Swap;
+}
+
+// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
+// builtins) into stores with swaps.
+SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ SDValue Chain;
+ SDValue Base;
+ unsigned SrcOpnd;
+ MachineMemOperand *MMO;
+
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode for little endian VSX store");
+ case ISD::STORE: {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ Chain = ST->getChain();
+ Base = ST->getBasePtr();
+ MMO = ST->getMemOperand();
+ SrcOpnd = 1;
+ // If the MMO suggests this isn't a store of a full vector, leave
+ // things alone. For a built-in, we have to make the change for
+ // correctness, so if there is a size problem that will be a bug.
+ if (MMO->getSize() < 16)
+ return SDValue();
+ break;
+ }
+ case ISD::INTRINSIC_VOID: {
+ MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+ Chain = Intrin->getChain();
+ // Intrin->getBasePtr() oddly does not get what we want.
+ Base = Intrin->getOperand(3);
+ MMO = Intrin->getMemOperand();
+ SrcOpnd = 2;
+ break;
+ }
+ }
+
+ SDValue Src = N->getOperand(SrcOpnd);
+ MVT VecTy = Src.getValueType().getSimpleVT();
+ SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+ DAG.getVTList(VecTy, MVT::Other), Chain, Src);
+ DCI.AddToWorklist(Swap.getNode());
+ Chain = Swap.getValue(1);
+ SDValue StoreOps[] = { Chain, Swap, Base };
+ SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
+ DAG.getVTList(MVT::Other),
+ StoreOps, VecTy, MMO);
+ DCI.AddToWorklist(Store.getNode());
+ return Store;
+}
+
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
const TargetMachine &TM = getTargetMachine();
case ISD::SETCC:
case ISD::SELECT_CC:
return DAGCombineTruncBoolExt(N, DCI);
- case ISD::FDIV: {
- assert(TM.Options.UnsafeFPMath &&
- "Reciprocal estimates require UnsafeFPMath");
-
- if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
- SDValue RV =
- DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
- if (RV.getNode()) {
- DCI.AddToWorklist(RV.getNode());
- return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
- N->getOperand(0), RV);
- }
- } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND &&
- N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
- SDValue RV =
- DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
- DCI);
- if (RV.getNode()) {
- DCI.AddToWorklist(RV.getNode());
- RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
- N->getValueType(0), RV);
- DCI.AddToWorklist(RV.getNode());
- return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
- N->getOperand(0), RV);
- }
- } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND &&
- N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
- SDValue RV =
- DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
- DCI);
- if (RV.getNode()) {
- DCI.AddToWorklist(RV.getNode());
- RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
- N->getValueType(0), RV,
- N->getOperand(1).getOperand(1));
- DCI.AddToWorklist(RV.getNode());
- return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
- N->getOperand(0), RV);
- }
- }
-
- SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
- if (RV.getNode()) {
- DCI.AddToWorklist(RV.getNode());
- return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
- N->getOperand(0), RV);
- }
-
- }
- break;
- case ISD::FSQRT: {
- assert(TM.Options.UnsafeFPMath &&
- "Reciprocal estimates require UnsafeFPMath");
-
- // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
- // reciprocal sqrt.
- SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
- if (RV.getNode()) {
- DCI.AddToWorklist(RV.getNode());
- RV = DAGCombineFastRecip(RV, DCI);
- if (RV.getNode()) {
- // Unfortunately, RV is now NaN if the input was exactly 0. Select out
- // this case and force the answer to 0.
-
- EVT VT = RV.getValueType();
-
- SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
- if (VT.isVector()) {
- assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
- Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
- }
-
- SDValue ZeroCmp =
- DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
- N->getOperand(0), Zero, ISD::SETEQ);
- DCI.AddToWorklist(ZeroCmp.getNode());
- DCI.AddToWorklist(RV.getNode());
-
- RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
- ZeroCmp, Zero, RV);
- return RV;
- }
- }
-
- }
- break;
case ISD::SINT_TO_FP:
- if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
- if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
- // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
- // We allow the src/dst to be either f32/f64, but the intermediate
- // type must be i64.
- if (N->getOperand(0).getValueType() == MVT::i64 &&
- N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
- SDValue Val = N->getOperand(0).getOperand(0);
- if (Val.getValueType() == MVT::f32) {
- Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
- DCI.AddToWorklist(Val.getNode());
- }
-
- Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
- DCI.AddToWorklist(Val.getNode());
- Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
- DCI.AddToWorklist(Val.getNode());
- if (N->getValueType(0) == MVT::f32) {
- Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
- DAG.getIntPtrConstant(0));
- DCI.AddToWorklist(Val.getNode());
- }
- return Val;
- } else if (N->getOperand(0).getValueType() == MVT::i32) {
- // If the intermediate type is i32, we can avoid the load/store here
- // too.
- }
- }
- }
- break;
- case ISD::STORE:
+ case ISD::UINT_TO_FP:
+ return combineFPToIntToFP(N, DCI);
+ case ISD::STORE: {
// Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
!cast<StoreSDNode>(N)->isTruncatingStore() &&
Ops, cast<StoreSDNode>(N)->getMemoryVT(),
cast<StoreSDNode>(N)->getMemOperand());
}
+
+ // For little endian, VSX stores require generating xxswapd/lxvd2x.
+ EVT VT = N->getOperand(1).getValueType();
+ if (VT.isSimple()) {
+ MVT StoreVT = VT.getSimpleVT();
+ if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+ TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+ (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+ StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+ return expandVSXStoreForLE(N, DCI);
+ }
break;
+ }
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT VT = LD->getValueType(0);
+
+ // For little endian, VSX loads require generating lxvd2x/xxswapd.
+ if (VT.isSimple()) {
+ MVT LoadVT = VT.getSimpleVT();
+ if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+ TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+ (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+ LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+ return expandVSXLoadForLE(N, DCI);
+ }
+
Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+ // P8 and later hardware should just use LOAD.
+ !TM.getSubtarget<PPCSubtarget>().hasP8Vector() &&
(VT == MVT::v16i8 || VT == MVT::v8i16 ||
VT == MVT::v4i32 || VT == MVT::v4f32) &&
LD->getAlignment() < ABIAlignment) {
Intrinsic::ppc_altivec_lvsl);
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
- // Refine the alignment of the original load (a "new" load created here
- // which was identical to the first except for the alignment would be
- // merged with the existing node regardless).
+ // Create the new MMO for the new base load. It is like the original MMO,
+ // but represents an area in memory almost twice the vector size centered
+ // on the original address. If the address is unaligned, we might start
+ // reading up to (sizeof(vector)-1) bytes below the address of the
+ // original unaligned load.
MachineFunction &MF = DAG.getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(LD->getPointerInfo(),
- LD->getMemOperand()->getFlags(),
- LD->getMemoryVT().getStoreSize(),
- ABIAlignment);
- LD->refineAlignment(MMO);
- SDValue BaseLoad = SDValue(LD, 0);
+ MachineMemOperand *BaseMMO =
+ MF.getMachineMemOperand(LD->getMemOperand(),
+ -LD->getMemoryVT().getStoreSize()+1,
+ 2*LD->getMemoryVT().getStoreSize()-1);
+
+ // Create the new base load.
+ SDValue LDXIntID = DAG.getTargetConstant(Intrinsic::ppc_altivec_lvx,
+ getPointerTy());
+ SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
+ SDValue BaseLoad =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+ DAG.getVTList(MVT::v4i32, MVT::Other),
+ BaseLoadOps, MVT::v4i32, BaseMMO);
// Note that the value of IncOffset (which is provided to the next
// load's pointer info offset value, and thus used to calculate the
SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ MachineMemOperand *ExtraMMO =
+ MF.getMachineMemOperand(LD->getMemOperand(),
+ 1, 2*LD->getMemoryVT().getStoreSize()-1);
+ SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue ExtraLoad =
- DAG.getLoad(VT, dl, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(IncOffset),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), ABIAlignment);
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+ DAG.getVTList(MVT::v4i32, MVT::Other),
+ ExtraLoadOps, MVT::v4i32, ExtraMMO);
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
BaseLoad.getValue(1), ExtraLoad.getValue(1));
- if (BaseLoad.getValueType() != MVT::v4i32)
- BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
-
- if (ExtraLoad.getValueType() != MVT::v4i32)
- ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
-
// Because vperm has a big-endian bias, we must reverse the order
// of the input vectors and complement the permute control vector
// when generating little endian code. We have already handled the
if (VT != MVT::v4i32)
Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
- // Now we need to be really careful about how we update the users of the
- // original load. We cannot just call DCI.CombineTo (or
- // DAG.ReplaceAllUsesWith for that matter), because the load still has
- // uses created here (the permutation for example) that need to stay.
- SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
- while (UI != UE) {
- SDUse &Use = UI.getUse();
- SDNode *User = *UI;
- // Note: BaseLoad is checked here because it might not be N, but a
- // bitcast of N.
- if (User == Perm.getNode() || User == BaseLoad.getNode() ||
- User == TF.getNode() || Use.getResNo() > 1) {
- ++UI;
- continue;
- }
-
- SDValue To = Use.getResNo() ? TF : Perm;
- ++UI;
-
- SmallVector<SDValue, 8> Ops;
- for (SDNode::op_iterator O = User->op_begin(),
- OE = User->op_end(); O != OE; ++O) {
- if (*O == Use)
- Ops.push_back(To);
- else
- Ops.push_back(*O);
- }
-
- DAG.UpdateNodeOperands(User, Ops);
- }
-
+ // The output of the permutation is our loaded result, the TokenFactor is
+ // our new chain.
+ DCI.CombineTo(N, Perm, TF);
return SDValue(N, 0);
}
}
}
break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ // For little endian, VSX loads require generating lxvd2x/xxswapd.
+ if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+ TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ return expandVSXLoadForLE(N, DCI);
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_VOID: {
+ // For little endian, VSX stores require generating xxswapd/stxvd2x.
+ if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+ TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ return expandVSXStoreForLE(N, DCI);
+ }
+ }
+ break;
+ }
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
return SDValue();
}
+SDValue
+PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const {
+ // fold (sdiv X, pow2)
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i64 && !Subtarget.isPPC64())
+ return SDValue();
+ if ((VT != MVT::i32 && VT != MVT::i64) ||
+ !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+
+ bool IsNegPow2 = (-Divisor).isPowerOf2();
+ unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
+ SDValue ShiftAmt = DAG.getConstant(Lg2, VT);
+
+ SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
+ if (Created)
+ Created->push_back(Op.getNode());
+
+ if (IsNegPow2) {
+ Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), Op);
+ if (Created)
+ Created->push_back(Op.getNode());
+ }
+
+ return Op;
+}
+
//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//
}
}
+unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ switch (Subtarget.getDarwinDirective()) {
+ default: break;
+ case PPC::DIR_970:
+ case PPC::DIR_PWR4:
+ case PPC::DIR_PWR5:
+ case PPC::DIR_PWR5X:
+ case PPC::DIR_PWR6:
+ case PPC::DIR_PWR6X:
+ case PPC::DIR_PWR7:
+ case PPC::DIR_PWR8: {
+ if (!ML)
+ break;
+
+ const PPCInstrInfo *TII =
+ static_cast<const PPCInstrInfo *>(getTargetMachine().getSubtargetImpl()->
+ getInstrInfo());
+
+ // For small loops (between 5 and 8 instructions), align to a 32-byte
+ // boundary so that the entire loop fits in one instruction-cache line.
+ uint64_t LoopSize = 0;
+ for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
+ for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J)
+ LoopSize += TII->GetInstSizeInBytes(J);
+
+ if (LoopSize > 16 && LoopSize <= 32)
+ return 5;
+
+ break;
+ }
+ }
+
+ return TargetLowering::getPrefLoopAlignment(ML);
+}
/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
// the AsmName field from *RegisterInfo.td, then this would not be necessary.
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
PPC::GPRCRegClass.contains(R.first)) {
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
return std::make_pair(TRI->getMatchingSuperReg(R.first,
PPC::sub_32, &PPC::G8RCRegClass),
&PPC::G8RCRegClass);
}
+ // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
+ if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
+ R.first = PPC::CR0;
+ R.second = &PPC::CRRCRegClass;
+ }
+
return R;
}
case 'P': {
ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
if (!CST) return; // Must be an immediate to match.
- unsigned Value = CST->getZExtValue();
+ int64_t Value = CST->getSExtValue();
+ EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
+ // numbers are printed as such.
switch (Letter) {
default: llvm_unreachable("Unknown constraint letter!");
case 'I': // "I" is a signed 16-bit constant.
- if ((short)Value == (int)Value)
- Result = DAG.getTargetConstant(Value, Op.getValueType());
+ if (isInt<16>(Value))
+ Result = DAG.getTargetConstant(Value, TCVT);
break;
case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
+ if (isShiftedUInt<16, 16>(Value))
+ Result = DAG.getTargetConstant(Value, TCVT);
+ break;
case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
- if ((short)Value == 0)
- Result = DAG.getTargetConstant(Value, Op.getValueType());
+ if (isShiftedInt<16, 16>(Value))
+ Result = DAG.getTargetConstant(Value, TCVT);
break;
case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
- if ((Value >> 16) == 0)
- Result = DAG.getTargetConstant(Value, Op.getValueType());
+ if (isUInt<16>(Value))
+ Result = DAG.getTargetConstant(Value, TCVT);
break;
case 'M': // "M" is a constant that is greater than 31.
if (Value > 31)
- Result = DAG.getTargetConstant(Value, Op.getValueType());
+ Result = DAG.getTargetConstant(Value, TCVT);
break;
case 'N': // "N" is a positive constant that is an exact power of two.
- if ((int)Value > 0 && isPowerOf2_32(Value))
- Result = DAG.getTargetConstant(Value, Op.getValueType());
+ if (Value > 0 && isPowerOf2_64(Value))
+ Result = DAG.getTargetConstant(Value, TCVT);
break;
case 'O': // "O" is the constant zero.
if (Value == 0)
- Result = DAG.getTargetConstant(Value, Op.getValueType());
+ Result = DAG.getTargetConstant(Value, TCVT);
break;
case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
- if ((short)-Value == (int)-Value)
- Result = DAG.getTargetConstant(Value, Op.getValueType());
+ if (isInt<16>(-Value))
+ Result = DAG.getTargetConstant(Value, TCVT);
break;
}
break;
return false;
}
+bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const {
+
+ switch (Intrinsic) {
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::ppc_altivec_lvebx:
+ case Intrinsic::ppc_altivec_lvehx:
+ case Intrinsic::ppc_altivec_lvewx:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ case Intrinsic::ppc_vsx_lxvw4x: {
+ EVT VT;
+ switch (Intrinsic) {
+ case Intrinsic::ppc_altivec_lvebx:
+ VT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_lvehx:
+ VT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_lvewx:
+ VT = MVT::i32;
+ break;
+ case Intrinsic::ppc_vsx_lxvd2x:
+ VT = MVT::v2f64;
+ break;
+ default:
+ VT = MVT::v4i32;
+ break;
+ }
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = VT;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = -VT.getStoreSize()+1;
+ Info.size = 2*VT.getStoreSize()-1;
+ Info.align = 1;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ case Intrinsic::ppc_altivec_stvebx:
+ case Intrinsic::ppc_altivec_stvehx:
+ case Intrinsic::ppc_altivec_stvewx:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ case Intrinsic::ppc_vsx_stxvw4x: {
+ EVT VT;
+ switch (Intrinsic) {
+ case Intrinsic::ppc_altivec_stvebx:
+ VT = MVT::i8;
+ break;
+ case Intrinsic::ppc_altivec_stvehx:
+ VT = MVT::i16;
+ break;
+ case Intrinsic::ppc_altivec_stvewx:
+ VT = MVT::i32;
+ break;
+ case Intrinsic::ppc_vsx_stxvd2x:
+ VT = MVT::v2f64;
+ break;
+ default:
+ VT = MVT::v4i32;
+ break;
+ }
+
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = VT;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.offset = -VT.getStoreSize()+1;
+ Info.size = 2*VT.getStoreSize()-1;
+ Info.align = 1;
+ Info.vol = false;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
+
/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means it's safe to destination
return NumBits1 == 64 && NumBits2 == 32;
}
+bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ // Generally speaking, zexts are not free, but they are free when they can be
+ // folded with other operations.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
+ EVT MemVT = LD->getMemoryVT();
+ if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
+ (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
+ (LD->getExtensionType() == ISD::NON_EXTLOAD ||
+ LD->getExtensionType() == ISD::ZEXTLOAD))
+ return true;
+ }
+
+ // FIXME: Add other cases...
+ // - 32-bit shifts with a zext to i64
+ // - zext after ctlz, bswap, etc.
+ // - zext after and by a constant mask
+
+ return TargetLowering::isZExtFree(Val, VT2);
+}
+
+bool PPCTargetLowering::isFPExtFree(EVT VT) const {
+ assert(VT.isFloatingPoint());
+ return true;
+}
+
bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<16>(Imm) || isUInt<16>(Imm);
}
return isInt<16>(Imm) || isUInt<16>(Imm);
}
-bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
- unsigned,
- bool *Fast) const {
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned,
+ unsigned,
+ bool *Fast) const {
if (DisablePPCUnaligned)
return false;
if (VT.getSimpleVT().isVector()) {
if (Subtarget.hasVSX()) {
- if (VT != MVT::v2f64 && VT != MVT::v2i64)
+ if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
+ VT != MVT::v4f32 && VT != MVT::v4i32)
return false;
} else {
return false;
return false;
}
+const MCPhysReg *
+PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
+ // LR is a callee-save register, but we must treat it as clobbered by any call
+ // site. Hence we include LR in the scratch registers, which are in turn added
+ // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
+ // to CTR, which is used by any indirect call.
+ static const MCPhysReg ScratchRegs[] = {
+ PPC::X11, PPC::X12, PPC::LR8, PPC::CTR8, 0
+ };
+
+ return ScratchRegs;
+}
+
bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
EVT VT , unsigned DefinedValues) const {