X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FCellSPU%2FSPUISelLowering.cpp;h=19327d8acf46b741b02223901722c1c687ec2d75;hb=4d83b79c76044e3f3cefd2a6c1b0b792266935c8;hp=84dbb6a9a0f1c5d8e79b4f260c2757fcefd3a02d;hpb=72977a45a8ad9d9524c9b49399e89fb9a3a676ed;p=oota-llvm.git diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 84dbb6a9a0f..19327d8acf4 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1,4 +1,3 @@ -// //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// // The LLVM Compiler Infrastructure // @@ -11,21 +10,22 @@ // //===----------------------------------------------------------------------===// -#include "SPURegisterNames.h" #include "SPUISelLowering.h" #include "SPUTargetMachine.h" -#include "SPUFrameInfo.h" +#include "SPUFrameLowering.h" +#include "SPUMachineFunction.h" #include "llvm/Constants.h" #include "llvm/Function.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" +#include "llvm/Type.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" #include "llvm/Support/Debug.h" @@ -40,44 +40,12 @@ using namespace llvm; namespace { std::map node_names; - //! EVT mapping to useful data for Cell SPU - struct valtype_map_s { - const EVT valtype; - const int prefslot_byte; - }; - - const valtype_map_s valtype_map[] = { - { MVT::i1, 3 }, - { MVT::i8, 3 }, - { MVT::i16, 2 }, - { MVT::i32, 0 }, - { MVT::f32, 0 }, - { MVT::i64, 0 }, - { MVT::f64, 0 }, - { MVT::i128, 0 } - }; - - const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); - - const valtype_map_s *getValueTypeMapEntry(EVT VT) { - const valtype_map_s *retval = 0; - - for (size_t i = 0; i < n_valtype_map; ++i) { - if (valtype_map[i].valtype == VT) { - retval = valtype_map + i; - break; - } - } - -#ifndef NDEBUG - if (retval == 0) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "getValueTypeMapEntry returns NULL for " - << VT.getEVTString(); - llvm_report_error(Msg.str()); - } -#endif + // Byte offset of the preferred slot (counted from the MSB) + int prefslotOffset(EVT VT) { + int retval=0; + if (VT==MVT::i1) retval=3; + if (VT==MVT::i8) retval=3; + if (VT==MVT::i16) retval=2; return retval; } @@ -91,7 +59,7 @@ namespace { SDValue ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG, - bool isSigned, SDValue &Hi, SPUTargetLowering &TLI) { + bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) { // The input chain to this libcall is the entry node of the function. // Legalizing the call will automatically add the previous call to the // dependence. @@ -101,7 +69,7 @@ namespace { TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { EVT ArgVT = Op.getOperand(i).getValueType(); - const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = isSigned; @@ -112,14 +80,13 @@ namespace { TLI.getPointerTy()); // Splice the libcall in wherever FindInputOutputChains tells us to. 
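// Hedged usage note (not part of the patch): ExpandLibCall is the fallback
// for operations the SPU cannot perform inline; e.g. LowerFP_TO_INT further
// down expands f64 -> i64 roughly as
//   RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op0VT, OpVT);
//   SDValue Dummy;
//   return ExpandLibCall(LC, Op, DAG, /*isSigned=*/false, Dummy, TLI);
// so legalization emits a call into the runtime library instead.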
- const Type *RetTy = + Type *RetTy = Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext()); std::pair CallInfo = TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, - Op.getDebugLoc()); + Callee, Args, DAG, Op.getDebugLoc()); return CallInfo.first; } @@ -128,8 +95,6 @@ namespace { SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()), SPUTM(TM) { - // Fold away setcc operations if possible. - setPow2DivIsCheap(); // Use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(true); @@ -209,6 +174,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // SPU has no intrinsics for these particular operations: setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); // SPU has no division/remainder instructions setOperationAction(ISD::SREM, MVT::i8, Expand); @@ -255,6 +221,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); @@ -350,6 +319,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // Custom lower i128 -> i64 truncates setOperationAction(ISD::TRUNCATE, MVT::i64, Custom); + // Custom lower i32/i64 -> i128 sign extend + setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); @@ -376,18 +348,14 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal); + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - // Support label based line numbers. - setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand); - setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand); - // We want to legalize GlobalAddress and ConstantPool nodes into the // appropriate instructions to materialize the address. for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128; @@ -430,13 +398,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); - // "Odd size" vector classes that we're willing to support: - addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass); - for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT::SimpleValueType VT = (MVT::SimpleValueType)i; + // Set operation actions to legal types only. 
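// Hedged context for the guard just below: the "odd size" v2i32 register
// class is removed by this patch, so the FIRST_VECTOR_VALUETYPE ..
// LAST_VECTOR_VALUETYPE loop now also visits vector types with no SPU
// register class; isTypeLegal(VT) filters those out before any
// setOperationAction call, e.g. v2i32 now falls through to type
// legalization instead of claiming Legal add/sub support.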
+ if (!isTypeLegal(VT)) continue; + // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); @@ -446,9 +414,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::SELECT, VT, Legal); - setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::STORE, VT, Custom); // These operations need to be expanded: setOperationAction(ISD::SDIV, VT, Expand); @@ -456,6 +424,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); + // Expand all trunc stores + for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) { + MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j; + setTruncStoreAction(VT, TargetVT, Expand); + } + // Custom lower build_vector, constant pool spills, insert and // extract vector elements: setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -466,6 +441,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } + setOperationAction(ISD::SHL, MVT::v2i64, Expand); + setOperationAction(ISD::AND, MVT::v16i8, Custom); setOperationAction(ISD::OR, MVT::v16i8, Custom); setOperationAction(ISD::XOR, MVT::v16i8, Custom); @@ -473,8 +450,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v4f32, Legal); - setShiftAmountType(MVT::i32); setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct? setStackPointerRegisterToSaveRestore(SPU::R1); @@ -484,12 +461,14 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setMinFunctionAlignment(3); + computeRegisterProperties(); // Set pre-RA register scheduler default to BURR, which produces slightly // better code than the default (could also be TDRR, but TargetLowering.h // needs a mod to support that model): - setSchedulingPreference(SchedulingForRegPressure); + setSchedulingPreference(Sched::RegPressure); } const char * @@ -509,11 +488,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES"; - node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL"; - node_names[(unsigned) SPUISD::VEC_SRL] = "SPUISD::VEC_SRL"; - node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA"; + node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS"; + node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES"; node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; @@ -531,20 +507,25 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const return ((i != node_names.end()) ? 
i->second : 0); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const { - return 3; -} - //===----------------------------------------------------------------------===// // Return the Cell SPU's SETCC result type //===----------------------------------------------------------------------===// -MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const { - // i16 and i32 are valid SETCC result types - return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? - VT.getSimpleVT().SimpleTy : - MVT::i32); +EVT SPUTargetLowering::getSetCCResultType(EVT VT) const { + // i8, i16 and i32 are valid SETCC result types + MVT::SimpleValueType retval; + + switch(VT.getSimpleVT().SimpleTy){ + case MVT::i1: + case MVT::i8: + retval = MVT::i8; break; + case MVT::i16: + retval = MVT::i16; break; + case MVT::i32: + default: + retval = MVT::i32; + } + return retval; } //===----------------------------------------------------------------------===// @@ -582,113 +563,174 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); - const valtype_map_s *vtm = getValueTypeMapEntry(InVT); + int pso = prefslotOffset(InVT); DebugLoc dl = Op.getDebugLoc(); - - switch (LN->getAddressingMode()) { - case ISD::UNINDEXED: { - SDValue result; - SDValue basePtr = LN->getBasePtr(); - SDValue rotate; - - if (alignment == 16) { - ConstantSDNode *CN; - - // Special cases for a known aligned load to simplify the base pointer - // and the rotation amount: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast (basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte); - - if (rotamt < 0) - rotamt += 16; - - rotate = DAG.getConstant(rotamt, MVT::i16); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) - || (basePtr.getOpcode() == SPUISD::IndirectAddr - && basePtr.getOperand(0).getOpcode() == SPUISD::Hi - && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { - // Plain aligned a-form address: rotate into preferred slot - // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getConstant(rotamt, MVT::i16); - } else { - // Offset the rotate amount by the basePtr and the preferred slot - // byte offset - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getNode(ISD::ADD, dl, PtrVT, - basePtr, - DAG.getConstant(rotamt, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa(Op1)) { - // Convert the (add , ) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. 
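// Hedged background on the pessimistic path below: SPU memory ops move only
// aligned quadwords and address them as d-form (small immediate + register),
// x-form (register + register) or a-form (absolute). When the base is an
// (add ..., ...) that cannot fold cleanly, the sum is first materialized:
//   unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
//   the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
//   basePtr   = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
// so instruction selection sees a plain register base instead of building a
// 0(reg) d-form address.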
- basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add , ) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT, + (128 / InVT.getSizeInBits())); + + // two sanity checks + assert( LN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (InVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = LN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16); + + SDValue result; + SDValue basePtr = LN->getBasePtr(); + SDValue rotate; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and the rotation amount: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast (basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + int64_t rotamt = int64_t((offset & 0xf) - pso); + + if (rotamt < 0) + rotamt += 16; + + rotate = DAG.getConstant(rotamt, MVT::i16); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - + } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) + || (basePtr.getOpcode() == SPUISD::IndirectAddr + && basePtr.getOperand(0).getOpcode() == SPUISD::Hi + && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { + // Plain aligned a-form address: rotate into preferred slot + // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getConstant(rotamt, MVT::i16); + } else { // Offset the rotate amount by the basePtr and the preferred slot // byte offset + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; rotate = DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, - DAG.getConstant(-vtm->prefslot_byte, PtrVT)); + DAG.getConstant(rotamt, PtrVT)); } - - // Re-emit as a v16i8 vector load - result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), - LN->isVolatile(), 16); - + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa(Op1)) { + // Convert the (add , ) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. 
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add , ) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(-pso, PtrVT)); + } + + // Do the load as a i128 to allow possible shifting + SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr, + lowMemPtr, + LN->isVolatile(), LN->isNonTemporal(), 16); + + // When the size is not greater than alignment we get all data with just + // one load + if (alignment >= InVT.getSizeInBits()/8) { // Update the chain - the_chain = result.getValue(1); + the_chain = low.getValue(1); // Rotate into the preferred slot: - result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8, - result.getValue(0), rotate); + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128, + low.getValue(0), rotate); // Convert the loaded v16i8 vector to the appropriate vector type // specified by the operand: - EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), InVT, (128 / InVT.getSizeInBits())); result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, - DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result)); + DAG.getNode(ISD::BITCAST, dl, vecVT, result)); + } + // When alignment is less than the size, we might need (known only at + // run-time) two loads + // TODO: if the memory address is composed only from constants, we have + // extra kowledge, and might avoid the second load + else { + // storage position offset from lower 16 byte aligned memory chunk + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + basePtr, DAG.getConstant( 0xf, MVT::i32 ) ); + // get a registerfull of ones. (this implementation is a workaround: LLVM + // cannot handle 128 bit signed int constants) + SDValue ones = DAG.getConstant(-1, MVT::v4i32 ); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + SDValue high = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(16, PtrVT)), + highMemPtr, + LN->isVolatile(), LN->isNonTemporal(), 16); + + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + high.getValue(1)); + + // Shift the (possible) high part right to compensate the misalignemnt. + // if there is no highpart (i.e. value is i64 and offset is 4), this + // will zero out the high value. 
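// The recombination below in compact form (hedged restatement): with
// offset = misalignment within the low quadword, the loaded value is
//   result = (low <<bytes offset) | (high >>bytes (16 - offset))
// and when offset + size <= 16, the right shift by (16 - offset) bytes
// pushes the high quadword out entirely, so OR-ing it in stays harmless.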
+ high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high, + DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset + )); + + // Shift the low similarly + // TODO: add SPUISD::SHL_BYTES + low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); + + // Merge the two parts + result = DAG.getNode(ISD::BITCAST, dl, vecVT, + DAG.getNode(ISD::OR, dl, MVT::i128, low, high)); + + if (!InVT.isVector()) { + result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result ); + } + } // Handle extending loads by extending the scalar result: if (ExtType == ISD::SEXTLOAD) { result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result); @@ -712,24 +754,6 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { result = DAG.getNode(SPUISD::LDRESULT, dl, retvts, retops, sizeof(retops) / sizeof(retops[0])); return result; - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than " - "UNINDEXED\n"; - Msg << (unsigned) LN->getAddressingMode(); - llvm_report_error(Msg.str()); - /*NOTREACHED*/ - } - } - - return SDValue(); } /// Custom lower stores for CellSPU @@ -747,93 +771,103 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); DebugLoc dl = Op.getDebugLoc(); unsigned alignment = SN->getAlignment(); + SDValue result; + EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT, + (128 / StVT.getSizeInBits())); + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = SN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16); + + + // two sanity checks + assert( SN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (StVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + SDValue alignLoadVec; + SDValue basePtr = SN->getBasePtr(); + SDValue the_chain = SN->getChain(); + SDValue insertEltOffs; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + // Special cases for a known aligned load to simplify the base pointer + // and insertion byte: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast(basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & 0xf), PtrVT)); - switch (SN->getAddressingMode()) { - case ISD::UNINDEXED: { - // The vector type we really want to load from the 16-byte chunk. 
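// Hedged summary of the store scheme kept below for the single-quadword
// case: SPU stores are read-modify-write --
//   q  = load the containing 16-byte quadword
//   m  = SPUISD::SHUFFLE_MASK(insertEltOffs)        // c?d-style insert mask
//   q' = SPUISD::SHUFB(scalar_to_vector(value), q, m)
//   store q' back to the same quadword
// Only a store that crosses the 16-byte boundary needs the two-quadword
// variant further on.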
- EVT vecVT = EVT::getVectorVT(*DAG.getContext(), - VT, (128 / VT.getSizeInBits())), - stVecVT = EVT::getVectorVT(*DAG.getContext(), - StVT, (128 / StVT.getSizeInBits())); - - SDValue alignLoadVec; - SDValue basePtr = SN->getBasePtr(); - SDValue the_chain = SN->getChain(); - SDValue insertEltOffs; - - if (alignment == 16) { - ConstantSDNode *CN; - - // Special cases for a known aligned load to simplify the base pointer - // and insertion byte: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast(basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & 0xf), PtrVT)); - - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else { - // Otherwise, assume it's at byte 0 of basePtr - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant(0, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa(Op1)) { - // Convert the (add , ) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add , ) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - - // Insertion point is solely determined by basePtr's contents - insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + } else { + // Otherwise, assume it's at byte 0 of basePtr + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, DAG.getConstant(0, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa(Op1)) { + // Convert the (add , ) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. 
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add , ) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - // Re-emit as a v16i8 vector load - alignLoadVec = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, - SN->getSrcValue(), SN->getSrcValueOffset(), - SN->isVolatile(), 16); + // Insertion point is solely determined by basePtr's contents + insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Load the lower part of the memory to which to store. + SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr, + lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16); + // if we don't need to store over the 16 byte boundary, one store suffices + if (alignment >= StVT.getSizeInBits()/8) { // Update the chain - the_chain = alignLoadVec.getValue(1); + the_chain = low.getValue(1); - LoadSDNode *LN = cast(alignLoadVec); + LoadSDNode *LN = cast(low); SDValue theValue = SN->getValue(); - SDValue result; if (StVT != VT && (theValue.getOpcode() == ISD::AssertZext @@ -849,58 +883,122 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { // to the stack pointer, which is always aligned. #if !defined(NDEBUG) if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { - cerr << "CellSPU LowerSTORE: basePtr = "; + errs() << "CellSPU LowerSTORE: basePtr = "; basePtr.getNode()->dump(&DAG); - cerr << "\n"; + errs() << "\n"; } #endif - SDValue insertEltOp = - DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs); - SDValue vectorizeOp = - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue); + SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, + insertEltOffs); + SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, + theValue); result = DAG.getNode(SPUISD::SHUFB, dl, vecVT, - vectorizeOp, alignLoadVec, - DAG.getNode(ISD::BIT_CONVERT, dl, + vectorizeOp, low, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, insertEltOp)); result = DAG.getStore(the_chain, dl, result, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), - LN->isVolatile(), LN->getAlignment()); - -#if 0 && !defined(NDEBUG) - if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { - const SDValue ¤tRoot = DAG.getRoot(); - - DAG.setRoot(result); - cerr << "------- CellSPU:LowerStore result:\n"; - DAG.dump(); - cerr << "-------\n"; - DAG.setRoot(currentRoot); - } -#endif - - return result; - /*UNREACHED*/ - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than " - "UNINDEXED\n"; - Msg << (unsigned) SN->getAddressingMode(); - llvm_report_error(Msg.str()); - /*NOTREACHED*/ - } + lowMemPtr, + LN->isVolatile(), LN->isNonTemporal(), + 16); + + } + // do the store when it might cross the 16 byte memory access boundary. + else { + // TODO issue a warning if SN->isVolatile()== true? This is likely not + // what the user wanted. 
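// Shape of the boundary-crossing store below (hedged): build a 128-bit
// all-ones byte mask over the value's size, split it at the quadword
// boundary,
//   lowmask = mask >>bytes offset,   himask = mask <<bytes (16 - offset),
// AND the two loaded quadwords with the complements to punch a hole, shift
// the value into each half (>> offset for the low quad, << (16 - offset)
// for the high one), OR the halves in, and store both quadwords back under
// a single TokenFactor.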
+ + // address offset from nearest lower 16byte alinged address + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + SN->getBasePtr(), + DAG.getConstant(0xf, MVT::i32)); + // 16 - offset + SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset); + // 16 - sizeof(Value) + SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + DAG.getConstant( VT.getSizeInBits()/8, + MVT::i32)); + // get a registerfull of ones + SDValue ones = DAG.getConstant(-1, MVT::v4i32); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + // Create the 128 bit masks that have ones where the data to store is + // located. + SDValue lowmask, himask; + // if the value to store don't fill up the an entire 128 bits, zero + // out the last bits of the mask so that only the value we want to store + // is masked. + // this is e.g. in the case of store i32, align 2 + if (!VT.isVector()){ + Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus); + lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + surplus); + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask); + + } + else { + lowmask = ones; + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + } + // this will zero, if there are no data that goes to the high quad + himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + offset_compl); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask, + offset); + + // Load in the old data and zero out the parts that will be overwritten with + // the new data to store. + SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + hi.getValue(1)); + + low = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones)); + hi = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones)); + + // Shift the Value to store into place. rlow contains the parts that go to + // the lower memory chunk, rhi has the parts that go to the upper one. + SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset); + rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask); + SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value, + offset_compl); + + // Merge the old data and the new data and store the results + // Need to convert vectors here to integer as 'OR'ing floats assert + rlow = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow)); + rhi = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi)); + + low = DAG.getStore(the_chain, dl, rlow, basePtr, + lowMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + hi = DAG.getStore(the_chain, dl, rhi, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0), + hi.getValue(0)); } - return SDValue(); + return result; } //! 
Generate the address of a constant pool entry. @@ -908,7 +1006,7 @@ static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT PtrVT = Op.getValueType(); ConstantPoolSDNode *CP = cast(Op); - Constant *C = CP->getConstVal(); + const Constant *C = CP->getConstVal(); SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment()); SDValue Zero = DAG.getConstant(0, PtrVT); const TargetMachine &TM = DAG.getTarget(); @@ -966,8 +1064,9 @@ static SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT PtrVT = Op.getValueType(); GlobalAddressSDNode *GSDN = cast(Op); - GlobalValue *GV = GSDN->getGlobal(); - SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset()); + const GlobalValue *GV = GSDN->getGlobal(); + SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + PtrVT, GSDN->getOffset()); const TargetMachine &TM = DAG.getTarget(); SDValue Zero = DAG.getConstant(0, PtrVT); // FIXME there is no actual debug info here @@ -982,7 +1081,7 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo); } } else { - llvm_report_error("LowerGlobalAddress: Relocation model other than static" + report_fatal_error("LowerGlobalAddress: Relocation model other than static" "not supported."); /*NOTREACHED*/ } @@ -1007,7 +1106,7 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) { SDValue T = DAG.getConstant(dbits, MVT::i64); SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T); return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Tvec)); + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec)); } return SDValue(); @@ -1015,42 +1114,44 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) { SDValue SPUTargetLowering::LowerFormalArguments(SDValue Chain, - unsigned CallConv, bool isVarArg, + CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) + const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); + SPUFunctionInfo *FuncInfo = MF.getInfo(); - const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); - const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); - - unsigned ArgOffset = SPUFrameInfo::minStackSize(); + unsigned ArgOffset = SPUFrameLowering::minStackSize(); unsigned ArgRegIdx = 0; - unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + // FIXME: allow for other calling conventions + CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU); + // Add DAG nodes to load the arguments or copy them out of registers. 
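// Hedged sketch of the CCState-driven scheme that replaces the hand-rolled
// ArgRegs indexing: CCC_SPU (the SPU calling-convention table) assigns each
// formal argument up front, and the loop below merely dispatches on the
// result --
//   CCValAssign &VA = ArgLocs[ArgNo];
//   if (VA.isRegLoc())  // copy out of VA.getLocReg() via a fresh vreg
//   else                // load from a fixed frame object at ArgOffset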
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; SDValue ArgVal; + CCValAssign &VA = ArgLocs[ArgNo]; - if (ArgRegIdx < NumArgRegs) { + if (VA.isRegLoc()) { const TargetRegisterClass *ArgRegClass; switch (ObjectVT.getSimpleVT().SimpleTy) { - default: { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "LowerFormalArguments Unhandled argument type: " - << ObjectVT.getEVTString(); - llvm_report_error(Msg.str()); - } + default: + report_fatal_error("LowerFormalArguments Unhandled argument type: " + + Twine(ObjectVT.getEVTString())); case MVT::i8: ArgRegClass = &SPU::R8CRegClass; break; @@ -1083,16 +1184,17 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, } unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass); - RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg); + RegInfo.addLiveIn(VA.getLocReg(), VReg); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++ArgRegIdx; } else { // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type // or we're forced to do vararg - int FI = MFI->CreateFixedObject(ObjSize, ArgOffset); + int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0); + ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), + false, false, 0); ArgOffset += StackSlotSize; } @@ -1103,17 +1205,36 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, // vararg handling: if (isVarArg) { - // unsigned int ptr_size = PtrVT.getSizeInBits() / 8; + // FIXME: we should be able to query the argument registers from + // tablegen generated code. 
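// Sanity check on the table below (hedged): it enumerates R3..R79, i.e.
// (79 - 3) + 1 = 77 registers, which is why NumArgRegs is hard-coded to 77.
// Each register not consumed by fixed arguments is spilled as one v16i8
// quadword into a consecutive 16-byte stack slot so va_arg can walk the
// overflow area uniformly.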
+ static const unsigned ArgRegs[] = { + SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, + SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, + SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23, + SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30, + SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37, + SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44, + SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51, + SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58, + SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65, + SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72, + SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79 + }; + // size of ArgRegs array + unsigned NumArgRegs = 77; + // We will spill (79-3)+1 registers to the stack SmallVector MemOps; // Create the frame slot - for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) { - VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset); - SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); - SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8); - SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0); + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(StackSlotSize, ArgOffset, true)); + SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass); + SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); + SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(), + false, false, 0); Chain = Store.getOperand(0); MemOps.push_back(Store); @@ -1144,27 +1265,32 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) { SDValue SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - unsigned CallConv, bool isVarArg, - bool isTailCall, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) const { + // CellSPU target does not yet support tail call optimization. + isTailCall = false; const SPUSubtarget *ST = SPUTM.getSubtargetImpl(); unsigned NumOps = Outs.size(); - unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); - const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); - const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); + + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + // FIXME: allow for other calling conventions + CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); + + const unsigned NumArgRegs = ArgLocs.size(); + // Handy pointer type EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - // Accumulate how many bytes are to be pushed on the stack, including the - // linkage area, and parameter passing area. According to the SPU ABI, - // we minimally need space for [LR] and [SP] - unsigned NumStackBytes = SPUFrameInfo::minStackSize(); - // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. @@ -1172,7 +1298,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Figure out which arguments are going to go in registers, and which in // memory. 
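// Hedged note: the call side mirrors LowerFormalArguments --
// AnalyzeCallOperands(Outs, CCC_SPU) pre-assigns every outgoing value, so
// the per-type switch below collapses to one shared arm per case label:
// use VA.getLocReg() when a register was assigned, otherwise store the
// value to the stack slot at the running ArgOffset.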
- unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR] + unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR] unsigned ArgRegIdx = 0; // Keep track of registers passing arguments @@ -1180,8 +1306,9 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // And the arguments passed on the stack SmallVector MemOpChains; - for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = Outs[i].Val; + for (; ArgRegIdx != NumOps; ++ArgRegIdx) { + SDValue Arg = OutVals[ArgRegIdx]; + CCValAssign &VA = ArgLocs[ArgRegIdx]; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. @@ -1195,22 +1322,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case MVT::i32: case MVT::i64: case MVT::i128: - if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); - } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0)); - ArgOffset += StackSlotSize; - } - break; case MVT::f32: case MVT::f64: - if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); - } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0)); - ArgOffset += StackSlotSize; - } - break; case MVT::v2i64: case MVT::v2f64: case MVT::v4f32: @@ -1218,17 +1331,23 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case MVT::v8i16: case MVT::v16i8: if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0)); + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), + false, false, 0)); ArgOffset += StackSlotSize; } break; } } - // Update number of stack bytes actually used, insert a call sequence start - NumStackBytes = (ArgOffset - SPUFrameInfo::minStackSize()); + // Accumulate how many bytes are to be pushed on the stack, including the + // linkage area, and parameter passing area. According to the SPU ABI, + // we minimally need space for [LR] and [SP]. + unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize(); + + // Insert a call sequence start Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes, true)); @@ -1254,10 +1373,10 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - GlobalValue *GV = G->getGlobal(); + const GlobalValue *GV = G->getGlobal(); EVT CalleeVT = Callee.getValueType(); SDValue Zero = DAG.getConstant(0, PtrVT); - SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT); if (!ST->usingLargeMem()) { // Turn calls to targets that are defined (i.e., have bodies) into BRSL @@ -1307,7 +1426,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); // Returns a chain and a flag for retval copy to use. 
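// Hedged aside on the type change just below: MVT::Flag was renamed
// MVT::Glue upstream; it is still the artificial value that pins the call
// node to its surrounding register copies --
//   Chain  = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue), ...);
//   InFlag = Chain.getValue(1);  // consumed by the CopyFromReg of results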
- Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), + Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue), &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -1320,64 +1439,37 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (Ins.empty()) return Chain; + // Now handle the return value(s) + SmallVector RVLocs; + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); + + // If the call has results, copy the values out of the ret val registers. - switch (Ins[0].VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unexpected ret value!"); - case MVT::Other: break; - case MVT::i32: - if (Ins.size() > 1 && Ins[1].VT == MVT::i32) { - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4, - MVT::i32, InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, - Chain.getValue(2)).getValue(1); - InVals.push_back(Chain.getValue(0)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - } - break; - case MVT::i64: - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i64, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - break; - case MVT::i128: - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i128, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - break; - case MVT::f32: - case MVT::f64: - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - break; - case MVT::v2f64: - case MVT::v2i64: - case MVT::v4f32: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v16i8: - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - break; - } + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + InVals.push_back(Val); + } return Chain; } SDValue SPUTargetLowering::LowerReturn(SDValue Chain, - unsigned CallConv, bool isVarArg, + CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, - DebugLoc dl, SelectionDAG &DAG) { + const SmallVectorImpl &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_SPU); // If this is the first return lowered for this function, add the regs to the @@ -1394,7 +1486,7 @@ SPUTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); Flag = Chain.getValue(1); } @@ -1491,7 +1583,7 @@ SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG, return SDValue(); Value = Value >> 32; } - if (isS10Constant(Value)) + if (isInt<10>(Value)) return DAG.getTargetConstant(Value, ValueType); } @@ -1580,21 +1672,17 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { uint64_t SplatBits = APSplatBits.getZExtValue(); switch (VT.getSimpleVT().SimpleTy) { - default: { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "CellSPU: Unhandled VT in 
LowerBUILD_VECTOR, VT = " - << VT.getEVTString(); - llvm_report_error(Msg.str()); + default: + report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " + + Twine(VT.getEVTString())); /*NOTREACHED*/ - } case MVT::v4f32: { uint32_t Value32 = uint32_t(SplatBits); assert(SplatBitSize == 32 && "LowerBUILD_VECTOR: Unexpected floating point vector element."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(Value32, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, + return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T)); break; } @@ -1604,7 +1692,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(f64val, MVT::i64); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, + return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T)); break; } @@ -1614,7 +1702,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SmallVector Ops; Ops.assign(8, DAG.getConstant(Value16, MVT::i16)); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size())); } case MVT::v8i16: { @@ -1629,10 +1717,6 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T); } - case MVT::v2i32: { - SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T); - } case MVT::v2i64: { return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl); } @@ -1652,7 +1736,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, if (upper == lower) { // Magic constant that can be matched by IL, ILA, et. al. SDValue Val = DAG.getTargetConstant(upper, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Val, Val, Val, Val)); } else { @@ -1681,7 +1765,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, // Create lower vector if not a special pattern if (!lower_special) { SDValue LO32C = DAG.getConstant(lower, MVT::i32); - LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, LO32C, LO32C, LO32C, LO32C)); } @@ -1689,7 +1773,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, // Create upper vector if not a special pattern if (!upper_special) { SDValue HI32C = DAG.getConstant(upper, MVT::i32); - HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, HI32C, HI32C, HI32C, HI32C)); } @@ -1756,41 +1840,51 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // If we have a single element being moved from V1 to V2, this can be handled // using the C*[DX] compute mask instructions, but the vector elements have - // to be monotonically increasing with one exception element. + // to be monotonically increasing with one exception element, and the source + // slot of the element to move must be the same as the destination. 
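// Hedged background for the code below: the SPU "generate controls for
// insertion" instructions (cbd/chd/cwd/cdd) produce a SHUFB mask that
// inserts one element at the byte offset of an address operand. maskVT
// therefore picks v16i8/v8i16/v4i32/v2i64 to select the matching c?d form,
// and $sp -- guaranteed 16-byte aligned -- provides a base whose low bits
// are known zero:
//   Pointer = SPUISD::IndirectAddr($sp, V2EltOffset)
//   mask    = SPUISD::SHUFFLE_MASK(Pointer)       // one c?d instruction
//   result  = SPUISD::SHUFB(V2, V1, mask)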
EVT VecVT = V1.getValueType(); EVT EltVT = VecVT.getVectorElementType(); unsigned EltsFromV2 = 0; - unsigned V2Elt = 0; + unsigned V2EltOffset = 0; unsigned V2EltIdx0 = 0; unsigned CurrElt = 0; unsigned MaxElts = VecVT.getVectorNumElements(); unsigned PrevElt = 0; - unsigned V0Elt = 0; bool monotonic = true; bool rotate = true; + int rotamt=0; + EVT maskVT; // which of the c?d instructions to use if (EltVT == MVT::i8) { V2EltIdx0 = 16; + maskVT = MVT::v16i8; } else if (EltVT == MVT::i16) { V2EltIdx0 = 8; + maskVT = MVT::v8i16; } else if (EltVT == MVT::i32 || EltVT == MVT::f32) { V2EltIdx0 = 4; + maskVT = MVT::v4i32; } else if (EltVT == MVT::i64 || EltVT == MVT::f64) { V2EltIdx0 = 2; + maskVT = MVT::v2i64; } else llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE"); for (unsigned i = 0; i != MaxElts; ++i) { if (SVN->getMaskElt(i) < 0) continue; - + unsigned SrcElt = SVN->getMaskElt(i); if (monotonic) { if (SrcElt >= V2EltIdx0) { - if (1 >= (++EltsFromV2)) { - V2Elt = (V2EltIdx0 - SrcElt) << 2; - } + // TODO: optimize for the monotonic case when several consecutive + // elements are taken form V2. Do we ever get such a case? + if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0)) + V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8); + else + monotonic = false; + ++EltsFromV2; } else if (CurrElt != SrcElt) { monotonic = false; } @@ -1803,13 +1897,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if ((PrevElt == SrcElt - 1) || (PrevElt == MaxElts - 1 && SrcElt == 0)) { PrevElt = SrcElt; - if (SrcElt == 0) - V0Elt = i; } else { rotate = false; } - } else if (PrevElt == 0) { - // First time through, need to keep track of previous element + } else if (i == 0 || (PrevElt==0 && SrcElt==1)) { + // First time or after a "wrap around" + rotamt = SrcElt-i; PrevElt = SrcElt; } else { // This isn't a rotation, takes elements from vector 2 @@ -1820,24 +1913,23 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (EltsFromV2 == 1 && monotonic) { // Compute mask and shuffle - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - // Initialize temporary register to 0 - SDValue InitTempReg = - DAG.getCopyToReg(DAG.getEntryNode(), dl, VReg, DAG.getConstant(0, PtrVT)); - // Copy register's contents as index in SHUFFLE_MASK: - SDValue ShufMaskOp = - DAG.getNode(SPUISD::SHUFFLE_MASK, dl, MVT::v4i32, - DAG.getTargetConstant(V2Elt, MVT::i32), - DAG.getCopyFromReg(InitTempReg, dl, VReg, PtrVT)); + + // As SHUFFLE_MASK becomes a c?d instruction, feed it an address + // R1 ($sp) is used here only as it is guaranteed to have last bits zero + SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + DAG.getRegister(SPU::R1, PtrVT), + DAG.getConstant(V2EltOffset, MVT::i32)); + SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, + maskVT, Pointer); + // Use shuffle mask in SHUFB synthetic instruction: return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1, ShufMaskOp); } else if (rotate) { - int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8; - + if (rotamt < 0) + rotamt +=MaxElts; + rotamt *= EltVT.getSizeInBits()/8; return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(), V1, DAG.getConstant(rotamt, MVT::i16)); } else { @@ -1852,7 +1944,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { for (unsigned j = 0; j < 
BytesPerElement; ++j) ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8)); } - SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &ResultMask[0], ResultMask.size()); return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask); @@ -1963,7 +2054,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { assert(prefslot_begin != -1 && prefslot_end != -1 && "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized"); - unsigned int ShufBytes[16]; + unsigned int ShufBytes[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; for (int i = 0; i < 16; ++i) { // zero fill uppper part of preferred slot, don't care about the // other slots: @@ -2000,8 +2093,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // Variable index: Rotate the requested element into slot 0, then replicate // slot 0 across the vector EVT VecVT = N.getValueType(); - if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) { - llvm_report_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit" + if (!VecVT.isSimple() || !VecVT.isVector()) { + report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit" "vector type!"); } @@ -2021,7 +2114,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(scaleShift, MVT::i32)); } - vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt); + vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt); // Replicate the bytes starting at byte 0 across the entire vector (for // consistency with the notion of a unified register set) @@ -2029,7 +2122,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { switch (VT.getSimpleVT().SimpleTy) { default: - llvm_report_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector" + report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector" "type"); /*NOTREACHED*/ case MVT::i8: { @@ -2075,22 +2168,31 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue IdxOp = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); + EVT eltVT = ValOp.getValueType(); - ConstantSDNode *CN = cast(IdxOp); - assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); + // use 0 when the lane to insert to is 'undef' + int64_t Offset=0; + if (IdxOp.getOpcode() != ISD::UNDEF) { + ConstantSDNode *CN = cast(IdxOp); + assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); + Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8; + } EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Use $sp ($1) because it's always 16-byte aligned and it's available: SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), - DAG.getConstant(CN->getSExtValue(), PtrVT)); - SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer); + DAG.getConstant(Offset, PtrVT)); + // widen the mask when dealing with half vectors + EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), + 128/ VT.getVectorElementType().getSizeInBits()); + SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); SDValue result = DAG.getNode(SPUISD::SHUFB, dl, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp), VecOp, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask)); + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask)); return result; } @@ -2100,7 +2202,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc, { 
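// Hedged note on LowerI8Math: the SPU has no native i8 multiply or shift,
// so this helper widens the operands to i16 (sign- or zero-extending to
// match Opc), performs the i16 operation, and lets the result truncate
// back; getShiftAmountTy(N0.getValueType()) reflects the upstream move from
// one global shift-amount type to a per-type query.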
@@ -2100,7 +2202,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
 {
   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
   DebugLoc dl = Op.getDebugLoc();

-  EVT ShiftVT = TLI.getShiftAmountTy();
+  EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());

   assert(Op.getValueType() == MVT::i8);
   switch (Opc) {
@@ -2210,12 +2312,12 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
     ConstVec = Op.getOperand(0);
     Arg = Op.getOperand(1);
     if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
-      if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
+      if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
         ConstVec = ConstVec.getOperand(0);
       } else {
         ConstVec = Op.getOperand(1);
         Arg = Op.getOperand(0);
-        if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
+        if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
           ConstVec = ConstVec.getOperand(0);
         }
       }
@@ -2256,7 +2358,7 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
  */
 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
-  EVT vecVT = EVT::getVectorVT(*DAG.getContext(), 
+  EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
                                VT, (128 / VT.getSizeInBits()));
   DebugLoc dl = Op.getDebugLoc();
@@ -2365,7 +2467,7 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
  All conversions to i64 are expanded to a libcall.
  */
 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
-                              SPUTargetLowering &TLI) {
+                              const SPUTargetLowering &TLI) {
   EVT OpVT = Op.getValueType();
   SDValue Op0 = Op.getOperand(0);
   EVT Op0VT = Op0.getValueType();
@@ -2391,7 +2493,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
  All conversions from i64 are expanded to a libcall.
  */
 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
-                              SPUTargetLowering &TLI) {
+                              const SPUTargetLowering &TLI) {
   EVT OpVT = Op.getValueType();
   SDValue Op0 = Op.getOperand(0);
   EVT Op0VT = Op0.getValueType();
@@ -2432,7 +2534,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,

     // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
     // selected to a NOP:
-    SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs);
+    SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
     SDValue lhsHi32 =
       DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                   DAG.getNode(ISD::SRL, dl, IntVT,
@@ -2466,7 +2568,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
                       ISD::SETGT));
   }

-  SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs);
+  SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
   SDValue rhsHi32 =
     DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                 DAG.getNode(ISD::SRL, dl, IntVT,
@@ -2512,7 +2614,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
   case ISD::SETONE:
     compareOp = ISD::SETNE; break;
   default:
-    llvm_report_error("CellSPU ISel Select: unimplemented f64 condition");
+    report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
   }

   SDValue result =
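Aside (illustrative, not part of the patch): the i64-typed pieces above implement a compare by halves: the high 32-bit words decide the result unless they are equal, in which case the low words are compared unsigned. The bare two's-complement version of that idea (the surrounding lowering additionally layers IEEE-754 ordered-compare handling on top):

    #include <cstdint>
    #include <cassert>

    // i64 signed greater-than built from 32-bit half comparisons, mirroring
    // the lhsHi32/rhsHi32 decomposition above.
    bool sgt64(int64_t a, int64_t b) {
      int32_t ahi = (int32_t)(a >> 32), bhi = (int32_t)(b >> 32);
      uint32_t alo = (uint32_t)a, blo = (uint32_t)b;
      return ahi > bhi || (ahi == bhi && alo > blo);  // low halves: unsigned
    }

    int main() {
      assert(sgt64(1, -1));
      assert(!sgt64(-2, -1));
      assert(sgt64(0x100000000LL, 0xFFFFFFFFLL));
    }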
@@ -2580,7 +2682,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
   // Type to truncate to
   EVT VT = Op.getValueType();
   MVT simpleVT = VT.getSimpleVT();
-  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), 
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
                                VT, (128 / VT.getSizeInBits()));
   DebugLoc dl = Op.getDebugLoc();

@@ -2588,7 +2690,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
   SDValue Op0 = Op.getOperand(0);
   EVT Op0VT = Op0.getValueType();

-  if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) {
+  if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
     // Create shuffle mask, least significant doubleword of quadword
     unsigned maskHigh = 0x08090a0b;
     unsigned maskLow = 0x0c0d0e0f;
@@ -2608,13 +2710,78 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)

   return SDValue();             // Leave the truncate unmolested
 }

+/*!
+ * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
+ * algorithm is to duplicate the sign bit using rotmai to generate at
+ * least one byte full of sign bits. Then propagate the "sign-byte" into
+ * the leftmost words and the i64/i32 into the rightmost words using shufb.
+ *
+ * @param Op The sext operand
+ * @param DAG The current DAG
+ * @return The SDValue with the entire instruction sequence
+ */
+static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
+{
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Type to extend to
+  MVT OpVT = Op.getValueType().getSimpleVT();
+
+  // Type to extend from
+  SDValue Op0 = Op.getOperand(0);
+  MVT Op0VT = Op0.getValueType().getSimpleVT();
+
+  // extend i8 & i16 via i32
+  if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
+    Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
+    Op0VT = MVT::i32;
+  }
+
+  // The type to extend to needs to be an i128 and
+  // the type to extend from needs to be i64 or i32.
+  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
+         "LowerSIGN_EXTEND: input and/or output operand have wrong size");
+  (void)OpVT;
+
+  // Create shuffle mask
+  unsigned mask1 = 0x10101010;                                  // byte  0 - 3 and 4 - 7
+  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
+  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
+  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                 DAG.getConstant(mask1, MVT::i32),
+                                 DAG.getConstant(mask1, MVT::i32),
+                                 DAG.getConstant(mask2, MVT::i32),
+                                 DAG.getConstant(mask3, MVT::i32));
+
+  // Word wise arithmetic right shift to generate at least one byte
+  // that contains sign bits.
+  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
+  SDValue sraVal = DAG.getNode(ISD::SRA,
+                               dl,
+                               mvt,
+                               DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
+                               DAG.getConstant(31, MVT::i32));
+
+  // reinterpret as an i128 (SHUFB requires it). This gets lowered away.
+  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                                dl, Op0VT, Op0,
+                                                DAG.getTargetConstant(
+                                                  SPU::GPRCRegClass.getID(),
+                                                  MVT::i32)), 0);
+  // Shuffle bytes - Copy the sign bits into the upper 64 bits
+  // and the input value into the lower 64 bits.
+  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
+                                   extended, sraVal, shufMask);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
+}
+
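Aside (illustrative, not part of the patch): a byte-level model of what the rotmai + shufb sequence above computes for an i64 source, on a big-endian 16-byte quadword as on SPU:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // i64 -> i128 sign extension as the shuffle mask arranges it:
    // bytes 0-7 take the replicated sign byte (what the 31-bit arithmetic
    // shift produces), bytes 8-15 take the input value (mask2/mask3).
    void sext64to128(int64_t In, uint8_t Out[16]) {
      std::memset(Out, In < 0 ? 0xFF : 0x00, 8);       // mask1: sign bytes
      for (int i = 0; i != 8; ++i)
        Out[8 + i] = (uint8_t)(In >> (8 * (7 - i)));   // the input value
    }

    int main() {
      uint8_t Q[16];
      sext64to128(-2, Q);
      for (int i = 0; i != 16; ++i)
        printf("%02x", Q[i]);   // ffffffffffffffff fffffffffffffffe
      printf("\n");
    }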
 //! Custom (target-specific) lowering entry point
 /*!
   This is where LLVM's DAG selection process calls to do target-specific
   lowering of nodes.
  */
 SDValue
-SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
+SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
 {
   unsigned Opc = (unsigned) Op.getOpcode();
   EVT VT = Op.getValueType();
@@ -2622,9 +2789,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   switch (Opc) {
   default: {
 #ifndef NDEBUG
-    cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
-    cerr << "Op.getOpcode() = " << Opc << "\n";
-    cerr << "*Op.getNode():\n";
+    errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
+    errs() << "Op.getOpcode() = " << Opc << "\n";
+    errs() << "*Op.getNode():\n";
     Op.getNode()->dump();
 #endif
     llvm_unreachable(0);
@@ -2700,6 +2867,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)

   case ISD::TRUNCATE:
     return LowerTRUNCATE(Op, DAG);
+
+  case ISD::SIGN_EXTEND:
+    return LowerSIGN_EXTEND(Op, DAG);
   }

   return SDValue();
@@ -2707,7 +2877,7 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)

 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
-                                           SelectionDAG &DAG)
+                                           SelectionDAG &DAG) const
 {
 #if 0
   unsigned Opc = (unsigned) N->getOpcode();
@@ -2715,9 +2885,9 @@ void SPUTargetLowering::ReplaceNodeResults(SDNode *N,

   switch (Opc) {
   default: {
-    cerr << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
-    cerr << "Op.getOpcode() = " << Opc << "\n";
-    cerr << "*Op.getNode():\n";
+    errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
+    errs() << "Op.getOpcode() = " << Opc << "\n";
+    errs() << "*Op.getNode():\n";
     N->dump();
     abort();
     /*NOTREACHED*/
@@ -2771,7 +2941,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const

 #if !defined(NDEBUG)
           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-            cerr << "\n"
+            errs() << "\n"
                  << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
                  << "With:    (SPUindirect <arg>, <arg>)\n";
           }
@@ -2787,7 +2957,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const

 #if !defined(NDEBUG)
         if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-          cerr << "\n"
+          errs() << "\n"
                << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
                << "), " << CN0->getSExtValue() << ")\n"
                << "With:    (SPUindirect <arg>, "
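Aside (illustrative, not part of the patch): the combine whose debug text appears above just folds a constant addend into the indirect address's existing constant offset. As plain arithmetic on a toy node (hypothetical types):

    #include <cassert>

    struct Indirect { int Base; long Offs; };  // stand-in for (SPUindirect <arg>, c)

    // (add (SPUindirect Base, c1), c0) -> (SPUindirect Base, c0 + c1)
    Indirect foldConstantOffset(Indirect A, long c0) {
      return { A.Base, A.Offs + c0 };
    }

    int main() {
      Indirect A = { 1, 8 };
      assert(foldConstantOffset(A, 4).Offs == 12);
    }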
@@ -2811,11 +2981,11 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
       // Types must match, however...
 #if !defined(NDEBUG)
       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-        cerr << "\nReplace: ";
+        errs() << "\nReplace: ";
         N->dump(&DAG);
-        cerr << "\nWith:    ";
+        errs() << "\nWith:    ";
         Op0.getNode()->dump(&DAG);
-        cerr << "\n";
+        errs() << "\n";
       }
 #endif
@@ -2826,15 +2996,15 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
   case SPUISD::IndirectAddr: {
     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
       ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
-      if (CN != 0 && CN->getZExtValue() == 0) {
+      if (CN != 0 && CN->isNullValue()) {
         // (SPUindirect (SPUaform <addr>, 0), 0) ->
         // (SPUaform <addr>, 0)

-        DEBUG(cerr << "Replace: ");
+        DEBUG(errs() << "Replace: ");
         DEBUG(N->dump(&DAG));
-        DEBUG(cerr << "\nWith:    ");
+        DEBUG(errs() << "\nWith:    ");
         DEBUG(Op0.getNode()->dump(&DAG));
-        DEBUG(cerr << "\n");
+        DEBUG(errs() << "\n");

         return Op0;
       }
@@ -2847,7 +3017,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const

 #if !defined(NDEBUG)
           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-            cerr << "\n"
+            errs() << "\n"
                  << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
                  << "With:    (SPUindirect <arg>, <arg>)\n";
           }
@@ -2860,11 +3030,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
     }
     break;
   }
-  case SPUISD::SHLQUAD_L_BITS:
-  case SPUISD::SHLQUAD_L_BYTES:
-  case SPUISD::VEC_SHL:
-  case SPUISD::VEC_SRL:
-  case SPUISD::VEC_SRA:
+  case SPUISD::SHL_BITS:
+  case SPUISD::SHL_BYTES:
   case SPUISD::ROTBYTES_LEFT: {
     SDValue Op1 = N->getOperand(1);
@@ -2909,11 +3076,11 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
   // Otherwise, return unchanged.
 #ifndef NDEBUG
   if (Result.getNode()) {
-    DEBUG(cerr << "\nReplace.SPU: ");
+    DEBUG(errs() << "\nReplace.SPU: ");
     DEBUG(N->dump(&DAG));
-    DEBUG(cerr << "\nWith:        ");
+    DEBUG(errs() << "\nWith:        ");
     DEBUG(Result.getNode()->dump(&DAG));
-    DEBUG(cerr << "\n");
+    DEBUG(errs() << "\n");
   }
 #endif
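Aside (illustrative, not part of the patch): the SHL_BITS/SHL_BYTES/ROTBYTES_LEFT case above feeds a combine whose unchanged body the diff does not show; assuming it matches the "kill degenerate vector shifts" logic elsewhere in this file, it replaces a shift or rotate by a constant zero with its first operand, since that is the identity. Modeled on a 64-bit stand-in for the 128-bit register:

    #include <cstdint>
    #include <cassert>

    // A zero-byte left shift of a quadword is a no-op, which is why the
    // combiner may return the shifted operand unchanged.
    uint64_t shlBytes(uint64_t v, unsigned bytes) {
      return bytes == 0 ? v : (bytes < 8 ? v << (8 * bytes) : 0);
    }

    int main() {
      assert(shlBytes(0xDEADBEEF, 0) == 0xDEADBEEF);  // degenerate: identity
      assert(shlBytes(0x1, 2) == 0x10000);
    }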
@@ -2942,6 +3109,38 @@ SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const
   return TargetLowering::getConstraintType(ConstraintLetter);
 }

+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+SPUTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+    // FIXME: Seems like the supported constraint letters were just copied
+    // from PPC, as the following doesn't correspond to the GCC docs.
+    // I'm leaving it so until someone adds the corresponding lowering support.
+  case 'b':
+  case 'r':
+  case 'f':
+  case 'd':
+  case 'v':
+  case 'y':
+    weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
 std::pair<unsigned, const TargetRegisterClass*>
 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                 EVT VT) const
@@ -2992,9 +3191,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   case SPUISD::VEC2PREFSLOT:
   case SPUISD::SHLQUAD_L_BITS:
   case SPUISD::SHLQUAD_L_BYTES:
-  case SPUISD::VEC_SHL:
-  case SPUISD::VEC_SRL:
-  case SPUISD::VEC_SRA:
   case SPUISD::VEC_ROTL:
   case SPUISD::VEC_ROTR:
   case SPUISD::ROTBYTES_LEFT:
@@ -3025,19 +3221,17 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,

 // LowerAsmOperandForConstraint
 void
 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                char ConstraintLetter,
-                                                bool hasMemory,
+                                                std::string &Constraint,
                                                 std::vector<SDValue> &Ops,
                                                 SelectionDAG &DAG) const {
   // Default, for the time being, to the base class handler
-  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, hasMemory,
-                                               Ops, DAG);
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }

 /// isLegalAddressImmediate - Return true if the integer value can be used
 /// as the offset of the target addressing mode.
 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
-                                                const Type *Ty) const {
+                                                Type *Ty) const {
   // SPU's addresses are 256K:
   return (V > -(1 << 18) && V < (1 << 18) - 1);
 }
@@ -3051,3 +3245,28 @@ SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The SPU target isn't yet aware of offsets.
   return false;
 }
+
+// Can we compare to Imm without writing it into a register?
+bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // ceqi, cgti, etc. all take an s10 operand
+  return isInt<10>(Imm);
+}
+
+bool
+SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                         Type *) const {
+
+  // A-form: 18-bit absolute address.
+  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
+    return true;
+
+  // D-form: reg + 14-bit offset
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
+    return true;
+
+  // X-form: reg + reg
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs == 0)
+    return true;
+
+  return false;
+}
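Aside (illustrative, not part of the patch): the three accepted modes correspond to the SPU's A-form, D-form and X-form load/store encodings. A self-contained restatement of the predicate, with a local struct standing in for TargetLowering::AddrMode and hypothetical names:

    #include <cstdint>
    #include <cassert>

    struct Addr { bool HasGV, HasBaseReg; int64_t Offs; int Scale; };

    // Mirrors LLVM's isInt<N>: does v fit in a signed N-bit immediate?
    static bool fitsSigned(int64_t v, unsigned bits) {
      return v >= -(INT64_C(1) << (bits - 1)) && v < (INT64_C(1) << (bits - 1));
    }

    bool isLegalSPUAddr(const Addr &AM) {
      if (AM.HasGV && !AM.HasBaseReg && AM.Scale == 0 && AM.Offs == 0)
        return true;                                   // A-form: absolute
      if (!AM.HasGV && AM.HasBaseReg && AM.Scale == 0 && fitsSigned(AM.Offs, 14))
        return true;                                   // D-form: reg + d14
      if (!AM.HasGV && AM.HasBaseReg && AM.Scale == 1 && AM.Offs == 0)
        return true;                                   // X-form: reg + reg
      return false;
    }

    int main() {
      assert(isLegalSPUAddr({false, true, 8191, 0}));  // D-form upper bound
      assert(!isLegalSPUAddr({false, true, 8192, 0})); // out of s14 range
    }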