X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FCellSPU%2FSPUISelLowering.cpp;h=36c8bd5684669459f451d17a683b8a5a032ce47f;hb=955ed73d12f2b186ef3f80da872b702cd7f2895b;hp=ba281cd463ccbdab76398b4ab52ca07dcba5504c;hpb=91fdee125cc95c322064a6f1ad8fba169f7ecebe;p=oota-llvm.git diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index ba281cd463c..36c8bd56846 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1,4 +1,3 @@ -// //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// // The LLVM Compiler Infrastructure // @@ -14,12 +13,13 @@ #include "SPURegisterNames.h" #include "SPUISelLowering.h" #include "SPUTargetMachine.h" -#include "SPUFrameInfo.h" +#include "SPUFrameLowering.h" #include "SPUMachineFunction.h" #include "llvm/Constants.h" #include "llvm/Function.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" +#include "llvm/Type.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -41,41 +41,12 @@ using namespace llvm; namespace { std::map node_names; - //! EVT mapping to useful data for Cell SPU - struct valtype_map_s { - EVT valtype; - int prefslot_byte; - }; - - const valtype_map_s valtype_map[] = { - { MVT::i1, 3 }, - { MVT::i8, 3 }, - { MVT::i16, 2 }, - { MVT::i32, 0 }, - { MVT::f32, 0 }, - { MVT::i64, 0 }, - { MVT::f64, 0 }, - { MVT::i128, 0 } - }; - - const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); - - const valtype_map_s *getValueTypeMapEntry(EVT VT) { - const valtype_map_s *retval = 0; - - for (size_t i = 0; i < n_valtype_map; ++i) { - if (valtype_map[i].valtype == VT) { - retval = valtype_map + i; - break; - } - } - -#ifndef NDEBUG - if (retval == 0) { - report_fatal_error("getValueTypeMapEntry returns NULL for " + - Twine(VT.getEVTString())); - } -#endif + // Byte offset of the preferred slot (counted from the MSB) + int prefslotOffset(EVT VT) { + int retval=0; + if (VT==MVT::i1) retval=3; + if (VT==MVT::i8) retval=3; + if (VT==MVT::i16) retval=2; return retval; } @@ -125,8 +96,6 @@ namespace { SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()), SPUTM(TM) { - // Fold away setcc operations if possible. - setPow2DivIsCheap(); // Use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(true); @@ -376,10 +345,10 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal); + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); // We cannot sextinreg(i1). Expand to shifts. 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -426,9 +395,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); - // "Odd size" vector classes that we're willing to support: - addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass); - for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT::SimpleValueType VT = (MVT::SimpleValueType)i; @@ -442,9 +408,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::SELECT, VT, Legal); - setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::STORE, VT, Custom); // These operations need to be expanded: setOperationAction(ISD::SDIV, VT, Expand); @@ -505,8 +471,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES"; + node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS"; + node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES"; node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; @@ -534,10 +500,20 @@ unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const { //===----------------------------------------------------------------------===// MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const { - // i16 and i32 are valid SETCC result types - return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? 
- VT.getSimpleVT().SimpleTy : - MVT::i32); + // i8, i16 and i32 are valid SETCC result types + MVT::SimpleValueType retval; + + switch(VT.getSimpleVT().SimpleTy){ + case MVT::i1: + case MVT::i8: + retval = MVT::i8; break; + case MVT::i16: + retval = MVT::i16; break; + case MVT::i32: + default: + retval = MVT::i32; + } + return retval; } //===----------------------------------------------------------------------===// @@ -575,113 +551,174 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); - const valtype_map_s *vtm = getValueTypeMapEntry(InVT); + int pso = prefslotOffset(InVT); DebugLoc dl = Op.getDebugLoc(); - - switch (LN->getAddressingMode()) { - case ISD::UNINDEXED: { - SDValue result; - SDValue basePtr = LN->getBasePtr(); - SDValue rotate; - - if (alignment == 16) { - ConstantSDNode *CN; - - // Special cases for a known aligned load to simplify the base pointer - // and the rotation amount: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast (basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte); - - if (rotamt < 0) - rotamt += 16; - - rotate = DAG.getConstant(rotamt, MVT::i16); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) - || (basePtr.getOpcode() == SPUISD::IndirectAddr - && basePtr.getOperand(0).getOpcode() == SPUISD::Hi - && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { - // Plain aligned a-form address: rotate into preferred slot - // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getConstant(rotamt, MVT::i16); - } else { - // Offset the rotate amount by the basePtr and the preferred slot - // byte offset - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getNode(ISD::ADD, dl, PtrVT, - basePtr, - DAG.getConstant(rotamt, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa(Op1)) { - // Convert the (add , ) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add , ) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + EVT vecVT = InVT.isVector()? 
InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
+                                 (128 / InVT.getSizeInBits()));
+
+  // two sanity checks
+  assert( LN->getAddressingMode() == ISD::UNINDEXED
+          && "we should get only UNINDEXED addresses");
+  // clean aligned loads can be selected as-is
+  if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
+    return SDValue();
+
+  // Get pointerinfos to the memory chunk(s) that contain the data to load
+  uint64_t mpi_offset = LN->getPointerInfo().Offset;
+  mpi_offset -= mpi_offset%16;
+  MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
+  MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
+
+  SDValue result;
+  SDValue basePtr = LN->getBasePtr();
+  SDValue rotate;
+
+  if ((alignment%16) == 0) {
+    ConstantSDNode *CN;
+
+    // Special cases for a known aligned load to simplify the base pointer
+    // and the rotation amount:
+    if (basePtr.getOpcode() == ISD::ADD
+        && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
+      // Known offset into basePtr
+      int64_t offset = CN->getSExtValue();
+      int64_t rotamt = int64_t((offset & 0xf) - pso);
+
+      if (rotamt < 0)
+        rotamt += 16;
+
+      rotate = DAG.getConstant(rotamt, MVT::i16);
+
+      // Simplify the base pointer for this case:
+      basePtr = basePtr.getOperand(0);
+      if ((offset & ~0xf) > 0) {
         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                               basePtr,
-                              DAG.getConstant(0, PtrVT));
+                              DAG.getConstant((offset & ~0xf), PtrVT));
       }
-
+    } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
+               || (basePtr.getOpcode() == SPUISD::IndirectAddr
+                   && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
+                   && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
+      // Plain aligned a-form address: rotate into preferred slot
+      // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
+      int64_t rotamt = -pso;
+      if (rotamt < 0)
+        rotamt += 16;
+      rotate = DAG.getConstant(rotamt, MVT::i16);
+    } else {
       // Offset the rotate amount by the basePtr and the preferred slot
       // byte offset
+      int64_t rotamt = -pso;
+      if (rotamt < 0)
+        rotamt += 16;
       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
                            basePtr,
-                           DAG.getConstant(-vtm->prefslot_byte, PtrVT));
+                           DAG.getConstant(rotamt, PtrVT));
     }
-
-    // Re-emit as a v16i8 vector load
-    result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
-                         LN->getSrcValue(), LN->getSrcValueOffset(),
-                         LN->isVolatile(), LN->isNonTemporal(), 16);
-
+  } else {
+    // Unaligned load: must be more pessimistic about addressing modes:
+    if (basePtr.getOpcode() == ISD::ADD) {
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+      SDValue Flag;
+
+      SDValue Op0 = basePtr.getOperand(0);
+      SDValue Op1 = basePtr.getOperand(1);
+
+      if (isa<ConstantSDNode>(Op1)) {
+        // Convert the (add <ptr>, <const>) to an indirect address contained
+        // in a register. Note that this is done because we need to avoid
+        // creating a 0(reg) d-form address due to the SPU's block loads.
+        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
+        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
+      } else {
+        // Convert the (add <ptr>, <ptr>) to an indirect address, which
+        // will likely be lowered as a reg(reg) x-form address.
+        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+      }
+    } else {
+      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                            basePtr,
+                            DAG.getConstant(0, PtrVT));
+    }
+
+    // Offset the rotate amount by the basePtr and the preferred slot
+    // byte offset
+    rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
+                         basePtr,
+                         DAG.getConstant(-pso, PtrVT));
+  }
+
+  // Do the load as an i128 to allow possible shifting
+  SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
+                            lowMemPtr,
+                            LN->isVolatile(), LN->isNonTemporal(), 16);
+
+  // When the size is not greater than alignment we get all data with just
+  // one load
+  if (alignment >= InVT.getSizeInBits()/8) {
     // Update the chain
-    the_chain = result.getValue(1);
+    the_chain = low.getValue(1);
 
     // Rotate into the preferred slot:
-    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8,
-                         result.getValue(0), rotate);
+    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
+                         low.getValue(0), rotate);
 
     // Convert the loaded v16i8 vector to the appropriate vector type
     // specified by the operand:
-    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
+    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
                                  InVT, (128 / InVT.getSizeInBits()));
     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
-                         DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));
+                         DAG.getNode(ISD::BITCAST, dl, vecVT, result));
+  }
+  // When alignment is less than the size, we might need (known only at
+  // run-time) two loads
+  // TODO: if the memory address is composed only from constants, we have
+  // extra knowledge, and might avoid the second load
+  else {
+    // storage position offset from lower 16 byte aligned memory chunk
+    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
+                                 basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
+    // get a register full of ones. (this implementation is a workaround: LLVM
+    // cannot handle 128 bit signed int constants)
+    SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
+    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
+
+    SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
+                               DAG.getNode(ISD::ADD, dl, PtrVT,
+                                           basePtr,
+                                           DAG.getConstant(16, PtrVT)),
+                               highMemPtr,
+                               LN->isVolatile(), LN->isNonTemporal(), 16);
+
+    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
+                            high.getValue(1));
+
+    // Shift the (possible) high part right to compensate the misalignment.
+    // If there is no high part (i.e. value is i64 and offset is 4), this
+    // will zero out the high value.
+    high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
+                       DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant( 16, MVT::i32),
+                                   offset
+                                   ));
+
+    // Shift the low similarly
+    // TODO: add SPUISD::SHL_BYTES
+    low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
+
+    // Merge the two parts
+    result = DAG.getNode(ISD::BITCAST, dl, vecVT,
+                         DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
+
+    if (!InVT.isVector()) {
+      result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
+    }
+
+  }
   // Handle extending loads by extending the scalar result:
   if (ExtType == ISD::SEXTLOAD) {
     result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
@@ -705,21 +742,6 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
                        retops, sizeof(retops) / sizeof(retops[0]));
   return result;
-  }
-  case ISD::PRE_INC:
-  case ISD::PRE_DEC:
-  case ISD::POST_INC:
-  case ISD::POST_DEC:
-  case ISD::LAST_INDEXED_MODE:
-    {
-      report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other "
-                         "than UNINDEXED\n" +
-                         Twine((unsigned)LN->getAddressingMode()));
-      /*NOTREACHED*/
-    }
-  }
-
-  return SDValue();
 }
 
 /// Custom lower stores for CellSPU
@@ -737,91 +759,103 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   unsigned alignment = SN->getAlignment();
+  SDValue result;
+  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
+                                                 (128 / StVT.getSizeInBits()));
+  // Get pointerinfos to the memory chunk(s) that the store will touch
+  uint64_t mpi_offset = SN->getPointerInfo().Offset;
+  mpi_offset -= mpi_offset%16;
+  MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
+  MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);
+
+
+  // two sanity checks
+  assert( SN->getAddressingMode() == ISD::UNINDEXED
+          && "we should get only UNINDEXED addresses");
+  // clean aligned stores can be selected as-is
+  if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
+    return SDValue();
+
+  SDValue alignLoadVec;
+  SDValue basePtr = SN->getBasePtr();
+  SDValue the_chain = SN->getChain();
+  SDValue insertEltOffs;
+
+  if ((alignment%16) == 0) {
+    ConstantSDNode *CN;
+    // Special cases for a known aligned load to simplify the base pointer
+    // and insertion byte:
+    if (basePtr.getOpcode() == ISD::ADD
+        && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
+      // Known offset into basePtr
+      int64_t offset = CN->getSExtValue();
+
+      // Simplify the base pointer for this case:
+      basePtr = basePtr.getOperand(0);
+      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                  basePtr,
+                                  DAG.getConstant((offset & 0xf), PtrVT));
 
-  switch (SN->getAddressingMode()) {
-  case ISD::UNINDEXED: {
-    // The vector type we really want to load from the 16-byte chunk.
-    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
-                                 VT, (128 / VT.getSizeInBits()));
-
-    SDValue alignLoadVec;
-    SDValue basePtr = SN->getBasePtr();
-    SDValue the_chain = SN->getChain();
-    SDValue insertEltOffs;
-
-    if (alignment == 16) {
-      ConstantSDNode *CN;
-
-      // Special cases for a known aligned load to simplify the base pointer
-      // and insertion byte:
-      if (basePtr.getOpcode() == ISD::ADD
-          && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
-        // Known offset into basePtr
-        int64_t offset = CN->getSExtValue();
-
-        // Simplify the base pointer for this case:
-        basePtr = basePtr.getOperand(0);
-        insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
-                                    basePtr,
-                                    DAG.getConstant((offset & 0xf), PtrVT));
-
-        if ((offset & ~0xf) > 0) {
-          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
-                                basePtr,
-                                DAG.getConstant((offset & ~0xf), PtrVT));
-        }
-      } else {
-        // Otherwise, assume it's at byte 0 of basePtr
-        insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
-                                    basePtr,
-                                    DAG.getConstant(0, PtrVT));
-      }
-    } else {
-      // Unaligned load: must be more pessimistic about addressing modes:
-      if (basePtr.getOpcode() == ISD::ADD) {
-        MachineFunction &MF = DAG.getMachineFunction();
-        MachineRegisterInfo &RegInfo = MF.getRegInfo();
-        unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
-        SDValue Flag;
-
-        SDValue Op0 = basePtr.getOperand(0);
-        SDValue Op1 = basePtr.getOperand(1);
-
-        if (isa<ConstantSDNode>(Op1)) {
-          // Convert the (add <ptr>, <const>) to an indirect address contained
-          // in a register. Note that this is done because we need to avoid
-          // creating a 0(reg) d-form address due to the SPU's block loads.
-          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
-          the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
-          basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
-        } else {
-          // Convert the (add <ptr>, <ptr>) to an indirect address, which
-          // will likely be lowered as a reg(reg) x-form address.
-          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
-        }
-      } else {
+      if ((offset & ~0xf) > 0) {
         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                               basePtr,
-                              DAG.getConstant(0, PtrVT));
+                              DAG.getConstant((offset & ~0xf), PtrVT));
       }
-
-      // Insertion point is solely determined by basePtr's contents
-      insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
+    } else {
+      // Otherwise, assume it's at byte 0 of basePtr
+      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                  basePtr,
+                                  DAG.getConstant(0, PtrVT));
+      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                   basePtr,
                                   DAG.getConstant(0, PtrVT));
     }
+  } else {
+    // Unaligned load: must be more pessimistic about addressing modes:
+    if (basePtr.getOpcode() == ISD::ADD) {
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+      SDValue Flag;
+
+      SDValue Op0 = basePtr.getOperand(0);
+      SDValue Op1 = basePtr.getOperand(1);
+
+      if (isa<ConstantSDNode>(Op1)) {
+        // Convert the (add <ptr>, <const>) to an indirect address contained
+        // in a register. Note that this is done because we need to avoid
+        // creating a 0(reg) d-form address due to the SPU's block loads.
+        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
+        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
+      } else {
+        // Convert the (add <ptr>, <ptr>) to an indirect address, which
+        // will likely be lowered as a reg(reg) x-form address.
+        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+      }
+    } else {
+      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                            basePtr,
+                            DAG.getConstant(0, PtrVT));
+    }
 
-      // Re-emit as a v16i8 vector load
-      alignLoadVec = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
-                                 SN->getSrcValue(), SN->getSrcValueOffset(),
-                                 SN->isVolatile(), SN->isNonTemporal(), 16);
+    // Insertion point is solely determined by basePtr's contents
+    insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
+                                basePtr,
+                                DAG.getConstant(0, PtrVT));
+  }
+
+  // Load the lower part of the memory to which to store.
+  SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
+                          lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);
 
+  // if we don't need to store over the 16 byte boundary, one store suffices
+  if (alignment >= StVT.getSizeInBits()/8) {
     // Update the chain
-    the_chain = alignLoadVec.getValue(1);
+    the_chain = low.getValue(1);
 
-    LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec);
+    LoadSDNode *LN = cast<LoadSDNode>(low);
     SDValue theValue = SN->getValue();
-    SDValue result;
 
     if (StVT != VT
         && (theValue.getOpcode() == ISD::AssertZext
@@ -843,50 +877,116 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
     }
 #endif
 
-    SDValue insertEltOp =
-        DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs);
-    SDValue vectorizeOp =
-        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue);
+    SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
+                                      insertEltOffs);
+    SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
+                                      theValue);
 
     result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
-                         vectorizeOp, alignLoadVec,
-                         DAG.getNode(ISD::BIT_CONVERT, dl,
+                         vectorizeOp, low,
+                         DAG.getNode(ISD::BITCAST, dl,
                                      MVT::v4i32, insertEltOp));
 
     result = DAG.getStore(the_chain, dl, result, basePtr,
-                          LN->getSrcValue(), LN->getSrcValueOffset(),
+                          lowMemPtr,
                           LN->isVolatile(), LN->isNonTemporal(),
-                          LN->getAlignment());
-
-#if 0 && !defined(NDEBUG)
-    if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-      const SDValue &currentRoot = DAG.getRoot();
-
-      DAG.setRoot(result);
-      errs() << "------- CellSPU:LowerStore result:\n";
-      DAG.dump();
-      errs() << "-------\n";
-      DAG.setRoot(currentRoot);
-    }
-#endif
-
-    return result;
-    /*UNREACHED*/
-  }
-  case ISD::PRE_INC:
-  case ISD::PRE_DEC:
-  case ISD::POST_INC:
-  case ISD::POST_DEC:
-  case ISD::LAST_INDEXED_MODE:
-    {
-      report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other "
-                         "than UNINDEXED\n" +
-                         Twine((unsigned)SN->getAddressingMode()));
-      /*NOTREACHED*/
-    }
+                          16);
+
+  }
+  // do the store when it might cross the 16 byte memory access boundary.
+  else {
+    // TODO issue a warning if SN->isVolatile() == true? This is likely not
+    // what the user wanted.
+
+    // address offset from nearest lower 16 byte aligned address
+    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
+                                 SN->getBasePtr(),
+                                 DAG.getConstant(0xf, MVT::i32));
+    // 16 - offset
+    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                       DAG.getConstant( 16, MVT::i32),
+                                       offset);
+    // 16 - sizeof(Value)
+    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                  DAG.getConstant( 16, MVT::i32),
+                                  DAG.getConstant( VT.getSizeInBits()/8,
+                                                   MVT::i32));
+    // get a register full of ones
+    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
+    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
+
+    // Create the 128 bit masks that have ones where the data to store is
+    // located.
+    SDValue lowmask, himask;
+    // if the value to store doesn't fill up an entire 128 bits, zero
+    // out the last bits of the mask so that only the value we want to store
+    // is masked.
+    // this is e.g. the case for a store i32, align 2
+    if (!VT.isVector()){
+      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
+      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
+      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
+                            surplus);
+      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
+      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
+
+    }
+    else {
+      lowmask = ones;
+      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
+    }
+    // this will be zero if no data goes to the high quad
+    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
+                         offset_compl);
+    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
+                          offset);
+
+    // Load in the old data and zero out the parts that will be overwritten with
+    // the new data to store.
+    SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
+                             DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
+                                         DAG.getConstant( 16, PtrVT)),
+                             highMemPtr,
+                             SN->isVolatile(), SN->isNonTemporal(), 16);
+    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
+                            hi.getValue(1));
+
+    low = DAG.getNode(ISD::AND, dl, MVT::i128,
+                      DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
+                      DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
+    hi = DAG.getNode(ISD::AND, dl, MVT::i128,
+                     DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
+                     DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
+
+    // Shift the Value to store into place. rlow contains the parts that go to
+    // the lower memory chunk, rhi has the parts that go to the upper one.
+    SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
+    rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
+    SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
+                              offset_compl);
+
+    // Merge the old data and the new data and store the results
+    // Need to convert vectors here to integer as 'OR'ing floats assert
+    rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
+                       DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
+                       DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
+    rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
+                      DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
+                      DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));
+
+    low = DAG.getStore(the_chain, dl, rlow, basePtr,
+                       lowMemPtr,
+                       SN->isVolatile(), SN->isNonTemporal(), 16);
+    hi = DAG.getStore(the_chain, dl, rhi,
+                      DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
+                                  DAG.getConstant( 16, PtrVT)),
+                      highMemPtr,
+                      SN->isVolatile(), SN->isNonTemporal(), 16);
+    result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
+                         hi.getValue(0));
   }
-  return SDValue();
+  return result;
 }
 
 //! Generate the address of a constant pool entry.
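[Editor's note] The unaligned-store path added above is a read-modify-write sequence: LowerSTORE now loads the two 16-byte quadwords that straddle the store, builds lowmask/himask over the bytes being replaced, clears those bytes in the old data, and ORs in the byte-shifted new value before storing both quadwords back. A rough scalar model of that merge, independent of the SelectionDAG API (the helper name and host-memory setting are illustrative, not part of this patch; assumes size <= 16 and both quadwords writable):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of the lowmask/himask merge: bytes [offset, offset + size)
// of the aligned 32-byte window are replaced, all other bytes of the two
// quadwords are preserved.
static void storeUnaligned16(uint8_t *mem, size_t addr,
                             const uint8_t *val, size_t size) {
  const size_t base   = addr & ~static_cast<size_t>(0xf); // low 16-byte chunk
  const size_t offset = addr & 0xf;                       // position inside it

  uint8_t lo[16], hi[16];
  std::memcpy(lo, mem + base, 16);      // load the "low" quadword
  std::memcpy(hi, mem + base + 16, 16); // load the "high" quadword

  for (size_t i = 0; i < size; ++i) {   // masked, byte-wise merge
    const size_t pos = offset + i;
    if (pos < 16)
      lo[pos] = val[i];                 // byte covered by lowmask
    else
      hi[pos - 16] = val[i];            // byte covered by himask
  }

  std::memcpy(mem + base, lo, 16);      // store both quadwords back
  std::memcpy(mem + base + 16, hi, 16);
}

On the SPU the same merge is done branch-free: the masks are built by byte-shifting an all-ones register with SRL_BYTES/SHL_BYTES, which is why the lowering materializes them at run time rather than as constants.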
@@ -953,7 +1053,8 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   EVT PtrVT = Op.getValueType();
   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GSDN->getGlobal();
-  SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
+  SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+                                          PtrVT, GSDN->getOffset());
   const TargetMachine &TM = DAG.getTarget();
   SDValue Zero = DAG.getConstant(0, PtrVT);
 
   // FIXME there is no actual debug info here
@@ -993,7 +1094,7 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
     SDValue T = DAG.getConstant(dbits, MVT::i64);
     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
-                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Tvec));
+                       DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
   }
 
   return SDValue();
@@ -1013,22 +1114,26 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
   MachineRegisterInfo &RegInfo = MF.getRegInfo();
   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
 
-  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
-  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
-
-  unsigned ArgOffset = SPUFrameInfo::minStackSize();
+  unsigned ArgOffset = SPUFrameLowering::minStackSize();
   unsigned ArgRegIdx = 0;
-  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
+  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+                 *DAG.getContext());
+  // FIXME: allow for other calling conventions
+  CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
+
   // Add DAG nodes to load the arguments or copy them out of registers.
   for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
     EVT ObjectVT = Ins[ArgNo].VT;
     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
     SDValue ArgVal;
+    CCValAssign &VA = ArgLocs[ArgNo];
 
-    if (ArgRegIdx < NumArgRegs) {
+    if (VA.isRegLoc()) {
       const TargetRegisterClass *ArgRegClass;
 
       switch (ObjectVT.getSimpleVT().SimpleTy) {
@@ -1067,16 +1172,17 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
       }
 
       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
-      RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg);
+      RegInfo.addLiveIn(VA.getLocReg(), VReg);
       ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
       ++ArgRegIdx;
     } else {
       // We need to load the argument to a virtual register if we determined
      // above that we ran out of physical registers of the appropriate type
      // or we're forced to do vararg
-      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true, false);
+      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0, false, false, 0);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, 0);
       ArgOffset += StackSlotSize;
     }
 
@@ -1087,20 +1193,35 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
 
   // vararg handling:
   if (isVarArg) {
-    // unsigned int ptr_size = PtrVT.getSizeInBits() / 8;
+    // FIXME: we should be able to query the argument registers from
+    // tablegen generated code.
+ static const unsigned ArgRegs[] = { + SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, + SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, + SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23, + SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30, + SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37, + SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44, + SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51, + SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58, + SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65, + SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72, + SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79 + }; + // size of ArgRegs array + unsigned NumArgRegs = 77; + // We will spill (79-3)+1 registers to the stack SmallVector MemOps; // Create the frame slot - for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) { FuncInfo->setVarArgsFrameIndex( - MFI->CreateFixedObject(StackSlotSize, ArgOffset, - true, false)); + MFI->CreateFixedObject(StackSlotSize, ArgOffset, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); - SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0, + SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(), false, false, 0); Chain = Store.getOperand(0); MemOps.push_back(Store); @@ -1135,6 +1256,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { @@ -1143,9 +1265,16 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, const SPUSubtarget *ST = SPUTM.getSubtargetImpl(); unsigned NumOps = Outs.size(); - unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); - const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); - const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); + + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + // FIXME: allow for other calling conventions + CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); + + const unsigned NumArgRegs = ArgLocs.size(); + // Handy pointer type EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -1157,7 +1286,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Figure out which arguments are going to go in registers, and which in // memory. - unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR] + unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR] unsigned ArgRegIdx = 0; // Keep track of registers passing arguments @@ -1165,8 +1294,9 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // And the arguments passed on the stack SmallVector MemOpChains; - for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = Outs[i].Val; + for (; ArgRegIdx != NumOps; ++ArgRegIdx) { + SDValue Arg = OutVals[ArgRegIdx]; + CCValAssign &VA = ArgLocs[ArgRegIdx]; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. 
@@ -1180,24 +1310,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case MVT::i32: case MVT::i64: case MVT::i128: - if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); - } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, - false, false, 0)); - ArgOffset += StackSlotSize; - } - break; case MVT::f32: case MVT::f64: - if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); - } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, - false, false, 0)); - ArgOffset += StackSlotSize; - } - break; case MVT::v2i64: case MVT::v2f64: case MVT::v4f32: @@ -1205,9 +1319,10 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case MVT::v8i16: case MVT::v16i8: if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0)); ArgOffset += StackSlotSize; } @@ -1218,7 +1333,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Accumulate how many bytes are to be pushed on the stack, including the // linkage area, and parameter passing area. According to the SPU ABI, // we minimally need space for [LR] and [SP]. - unsigned NumStackBytes = ArgOffset - SPUFrameInfo::minStackSize(); + unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize(); // Insert a call sequence start Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes, @@ -1249,7 +1364,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, const GlobalValue *GV = G->getGlobal(); EVT CalleeVT = Callee.getValueType(); SDValue Zero = DAG.getConstant(0, PtrVT); - SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT); if (!ST->usingLargeMem()) { // Turn calls to targets that are defined (i.e., have bodies) into BRSL @@ -1299,7 +1414,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), + Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue), &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -1312,41 +1427,23 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (Ins.empty()) return Chain; + // Now handle the return value(s) + SmallVector RVLocs; + CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); + + // If the call has results, copy the values out of the ret val registers. 
- switch (Ins[0].VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unexpected ret value!"); - case MVT::Other: break; - case MVT::i32: - if (Ins.size() > 1 && Ins[1].VT == MVT::i32) { - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4, - MVT::i32, InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, - Chain.getValue(2)).getValue(1); - InVals.push_back(Chain.getValue(0)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - } - break; - case MVT::i8: - case MVT::i16: - case MVT::i64: - case MVT::i128: - case MVT::f32: - case MVT::f64: - case MVT::v2f64: - case MVT::v2i64: - case MVT::v4f32: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v16i8: - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - break; - } + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + InVals.push_back(Val); + } return Chain; } @@ -1355,6 +1452,7 @@ SDValue SPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, DebugLoc dl, SelectionDAG &DAG) const { SmallVector RVLocs; @@ -1376,7 +1474,7 @@ SPUTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); Flag = Chain.getValue(1); } @@ -1572,7 +1670,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { && "LowerBUILD_VECTOR: Unexpected floating point vector element."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(Value32, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, + return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T)); break; } @@ -1582,7 +1680,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(f64val, MVT::i64); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, + return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T)); break; } @@ -1592,7 +1690,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SmallVector Ops; Ops.assign(8, DAG.getConstant(Value16, MVT::i16)); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size())); } case MVT::v8i16: { @@ -1607,10 +1705,6 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T); } - case MVT::v2i32: { - SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T); - } case MVT::v2i64: { return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl); } @@ -1630,7 +1724,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, if (upper == lower) { // Magic constant that can be matched by IL, ILA, et. al. 
SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+    return DAG.getNode(ISD::BITCAST, dl, OpVT,
                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                    Val, Val, Val, Val));
   } else {
@@ -1659,7 +1753,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
     // Create lower vector if not a special pattern
     if (!lower_special) {
       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
-      LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+      LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                      LO32C, LO32C, LO32C, LO32C));
     }
@@ -1667,7 +1761,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
     // Create upper vector if not a special pattern
     if (!upper_special) {
       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
-      HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+      HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                      HI32C, HI32C, HI32C, HI32C));
     }
@@ -1734,23 +1828,24 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
 
   // If we have a single element being moved from V1 to V2, this can be handled
   // using the C*[DX] compute mask instructions, but the vector elements have
-  // to be monotonically increasing with one exception element.
+  // to be monotonically increasing with one exception element, and the source
+  // slot of the element to move must be the same as the destination.
   EVT VecVT = V1.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
   unsigned EltsFromV2 = 0;
-  unsigned V2Elt = 0;
+  unsigned V2EltOffset = 0;
   unsigned V2EltIdx0 = 0;
   unsigned CurrElt = 0;
   unsigned MaxElts = VecVT.getVectorNumElements();
   unsigned PrevElt = 0;
-  unsigned V0Elt = 0;
   bool monotonic = true;
   bool rotate = true;
+  int rotamt = 0;
   EVT maskVT;             // which of the c?d instructions to use
 
   if (EltVT == MVT::i8) {
     V2EltIdx0 = 16;
-    maskVT = MVT::v16i8;
+    maskVT = MVT::v16i8;
   } else if (EltVT == MVT::i16) {
     V2EltIdx0 = 8;
     maskVT = MVT::v8i16;
@@ -1766,14 +1861,18 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   for (unsigned i = 0; i != MaxElts; ++i) {
     if (SVN->getMaskElt(i) < 0)
       continue;
-
+
     unsigned SrcElt = SVN->getMaskElt(i);
 
     if (monotonic) {
      if (SrcElt >= V2EltIdx0) {
-        if (1 >= (++EltsFromV2)) {
-          V2Elt = (V2EltIdx0 - SrcElt) << 2;
-        }
+        // TODO: optimize for the monotonic case when several consecutive
+        // elements are taken from V2. Do we ever get such a case?
+        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
+          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
+        else
+          monotonic = false;
+        ++EltsFromV2;
       } else if (CurrElt != SrcElt) {
         monotonic = false;
       }
@@ -1786,13 +1885,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
       if ((PrevElt == SrcElt - 1)
          || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
         PrevElt = SrcElt;
-        if (SrcElt == 0)
-          V0Elt = i;
       } else {
         rotate = false;
       }
-    } else if (i == 0) {
-      // First time through, need to keep track of previous element
+    } else if (i == 0 || (PrevElt == 0 && SrcElt == 1)) {
+      // First time or after a "wrap around"
+      rotamt = SrcElt - i;
       PrevElt = SrcElt;
     } else {
       // This isn't a rotation, takes elements from vector 2
@@ -1803,24 +1901,23 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
 
   if (EltsFromV2 == 1 && monotonic) {
     // Compute mask and shuffle
-    MachineFunction &MF = DAG.getMachineFunction();
-    MachineRegisterInfo &RegInfo = MF.getRegInfo();
     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 
     // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
     // R1 ($sp) is used here only as it is guaranteed to have last bits zero
     SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                   DAG.getRegister(SPU::R1, PtrVT),
-                                  DAG.getConstant(V2Elt, MVT::i32));
-    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
+                                  DAG.getConstant(V2EltOffset, MVT::i32));
+    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
                                      maskVT, Pointer);
 
     // Use shuffle mask in SHUFB synthetic instruction:
     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
                        ShufMaskOp);
   } else if (rotate) {
-    int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8;
-
+    if (rotamt < 0)
+      rotamt += MaxElts;
+    rotamt *= EltVT.getSizeInBits()/8;
     return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
                        V1, DAG.getConstant(rotamt, MVT::i16));
   } else {
@@ -1835,7 +1932,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      for (unsigned j = 0; j < BytesPerElement; ++j)
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
   }
-
   SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                   &ResultMask[0], ResultMask.size());
   return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
@@ -1985,7 +2081,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   // Variable index: Rotate the requested element into slot 0, then replicate
   // slot 0 across the vector
   EVT VecVT = N.getValueType();
-  if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
+  if (!VecVT.isSimple() || !VecVT.isVector()) {
     report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit "
                        "vector type!");
   }
@@ -2006,7 +2102,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
                       DAG.getConstant(scaleShift, MVT::i32));
   }
 
-  vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt);
+  vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
 
   // Replicate the bytes starting at byte 0 across the entire vector (for
   // consistency with the notion of a unified register set)
@@ -2060,27 +2156,31 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   SDValue IdxOp = Op.getOperand(2);
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
+  EVT eltVT = ValOp.getValueType();
 
   // use 0 when the lane to insert to is 'undef'
-  int64_t Idx=0;
+  int64_t Offset = 0;
   if (IdxOp.getOpcode() != ISD::UNDEF) {
     ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
     assert(CN != 0 &&
"LowerINSERT_VECTOR_ELT: Index is not constant!"); - Idx = (CN->getSExtValue()); + Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8; } EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Use $sp ($1) because it's always 16-byte aligned and it's available: SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), - DAG.getConstant(Idx, PtrVT)); - SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer); + DAG.getConstant(Offset, PtrVT)); + // widen the mask when dealing with half vectors + EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), + 128/ VT.getVectorElementType().getSizeInBits()); + SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); SDValue result = DAG.getNode(SPUISD::SHUFB, dl, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp), VecOp, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask)); + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask)); return result; } @@ -2200,12 +2300,12 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { ConstVec = Op.getOperand(0); Arg = Op.getOperand(1); if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) { - if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) { + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { ConstVec = ConstVec.getOperand(0); } else { ConstVec = Op.getOperand(1); Arg = Op.getOperand(0); - if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) { + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { ConstVec = ConstVec.getOperand(0); } } @@ -2246,7 +2346,7 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { */ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), VT, (128 / VT.getSizeInBits())); DebugLoc dl = Op.getDebugLoc(); @@ -2422,7 +2522,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG, // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently // selected to a NOP: - SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs); + SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs); SDValue lhsHi32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRL, dl, IntVT, @@ -2456,7 +2556,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG, ISD::SETGT)); } - SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs); + SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs); SDValue rhsHi32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRL, dl, IntVT, @@ -2570,7 +2670,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) // Type to truncate to EVT VT = Op.getValueType(); MVT simpleVT = VT.getSimpleVT(); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, (128 / VT.getSizeInBits())); DebugLoc dl = Op.getDebugLoc(); @@ -2578,7 +2678,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) SDValue Op0 = Op.getOperand(0); EVT Op0VT = Op0.getValueType(); - if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) { + if (Op0VT == MVT::i128 && simpleVT == MVT::i64) { // Create shuffle mask, least significant doubleword of quadword unsigned maskHigh = 0x08090a0b; unsigned maskLow = 0x0c0d0e0f; @@ -2643,12 +2743,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0), DAG.getConstant(31, MVT::i32)); + // reinterpret as a i128 (SHUFB requires it). 
This gets lowered away.
+  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                        dl, Op0VT, Op0,
+                                        DAG.getTargetConstant(
+                                                     SPU::GPRCRegClass.getID(),
+                                                     MVT::i32)), 0);
   // Shuffle bytes - Copy the sign bits into the upper 64 bits
   // and the input value into the lower 64 bits.
   SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
-        DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i128, Op0), sraVal, shufMask);
-
-  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle);
+        extended, sraVal, shufMask);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
 }
 
 //! Custom (target-specific) lowering entry point
@@ -2906,8 +3011,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
       }
       break;
     }
-    case SPUISD::SHLQUAD_L_BITS:
-    case SPUISD::SHLQUAD_L_BYTES:
+    case SPUISD::SHL_BITS:
+    case SPUISD::SHL_BYTES:
    case SPUISD::ROTBYTES_LEFT: {
      SDValue Op1 = N->getOperand(1);

@@ -2985,6 +3090,38 @@ SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const
   return TargetLowering::getConstraintType(ConstraintLetter);
 }
 
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+SPUTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+    // FIXME: Seems like the supported constraint letters were just copied
+    // from PPC, as the following doesn't correspond to the GCC docs.
+    // I'm leaving it as-is until someone adds the corresponding lowering
+    // support.
+  case 'b':
+  case 'r':
+  case 'f':
+  case 'd':
+  case 'v':
+  case 'y':
+    weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
 std::pair<unsigned, const TargetRegisterClass*>
 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                 EVT VT) const
@@ -3066,12 +3203,10 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
 void
 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                 char ConstraintLetter,
-                                                bool hasMemory,
                                                 std::vector<SDValue> &Ops,
                                                 SelectionDAG &DAG) const {
   // Default, for the time being, to the base class handler
-  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, hasMemory,
-                                               Ops, DAG);
+  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG);
 }
 
 /// isLegalAddressImmediate - Return true if the integer value can be used
@@ -3091,3 +3226,28 @@ SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The SPU target isn't yet aware of offsets.
   return false;
 }
+
+// Can we compare to Imm without writing it into a register?
+bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // ceqi, cgti, etc. all take an s10 operand
+  return isInt<10>(Imm);
+}
+
+bool
+SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                         const Type * ) const {
+
+  // A-form: 18bit absolute address.
+  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
+    return true;
+
+  // D-form: reg + 14bit offset
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
+    return true;
+
+  // X-form: reg+reg
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs == 0)
+    return true;
+
+  return false;
+}
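[Editor's note] For reference, the LowerLOAD hunks earlier in this patch reassemble an unaligned value from two aligned i128 loads, a byte shift of each half, and an OR. A scalar sketch of that merge under the same assumptions as the store sketch above (illustrative names only, not code from this patch):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of the SHL_BYTES/SRL_BYTES merge in LowerLOAD: the low
// quadword supplies bytes from `offset` upward, the high quadword supplies
// the remainder; when the value does not cross the 16-byte boundary the
// shifted high part contributes nothing, matching the "will zero out the
// high value" comment in the lowering.
static void loadUnaligned16(const uint8_t *mem, size_t addr, uint8_t out[16]) {
  const size_t base   = addr & ~static_cast<size_t>(0xf);
  const size_t offset = addr & 0xf;

  uint8_t low[16], high[16];
  std::memcpy(low,  mem + base,      16); // aligned load, low quadword
  std::memcpy(high, mem + base + 16, 16); // aligned load, high quadword

  for (size_t i = 0; i < 16; ++i)
    out[i] = (offset + i < 16) ? low[offset + i]        // low << offset bytes
                               : high[offset + i - 16]; // high >> (16 - offset)
}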