From: Evan Cheng
Date: Thu, 1 Apr 2010 06:04:33 +0000 (+0000)
Subject: Fix sdisel memcpy, memset, memmove lowering:

Fix sdisel memcpy, memset, memmove lowering:
1. Makes it possible to lower with floating point loads and stores.
2. Avoid unaligned loads / stores unless it's fast.
3. Fix some memcpy lowering logic bug related to when to optimize a load from constant string into a constant.
4. Adjust x86 memcpy lowering threshold to make it more sane.
5. Fix x86 target hook so it uses vector and floating point memory ops more effectively.
rdar://7774704

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@100090 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index dd04785fe85..23506c9feb9 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -522,7 +522,7 @@ public:
   /// counterpart (e.g. structs), otherwise it will assert.
   EVT getValueType(const Type *Ty, bool AllowUnknown = false) const {
     EVT VT = EVT::getEVT(Ty, AllowUnknown);
-    return VT == MVT:: iPTR ? PointerTy : VT;
+    return VT == MVT::iPTR ? PointerTy : VT;
   }
 
   /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -636,8 +636,8 @@ public:
   /// and store operations as a result of memset, memcpy, and memmove lowering.
   /// It returns EVT::Other if SelectionDAG should be responsible for
   /// determining it.
-  virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align,
-                                  bool isSrcConst, bool isSrcStr,
+  virtual EVT getOptimalMemOpType(uint64_t Size,
+                                  unsigned DstAlign, unsigned SrcAlign,
                                   SelectionDAG &DAG) const {
     return MVT::Other;
   }
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aa283ad894e..a336e0a01b5 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5022,18 +5022,6 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
   SDValue Chain = LD->getChain();
   SDValue Ptr   = LD->getBasePtr();
 
-  // Try to infer better alignment information than the load already has.
-  if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
-    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
-      if (Align > LD->getAlignment())
-        return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(),
-                              LD->getValueType(0),
-                              Chain, Ptr, LD->getSrcValue(),
-                              LD->getSrcValueOffset(), LD->getMemoryVT(),
-                              LD->isVolatile(), LD->isNonTemporal(), Align);
-    }
-  }
-
   // If load is not volatile and there are no uses of the loaded value (and
   // the updated indexed value in case of indexed loads), change uses of the
   // chain value into uses of the chain input (i.e. delete the dead load).
@@ -5099,6 +5087,18 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
     }
   }
 
+  // Try to infer better alignment information than the load already has.
+  if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+      if (Align > LD->getAlignment())
+        return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(),
+                              LD->getValueType(0),
+                              Chain, Ptr, LD->getSrcValue(),
+                              LD->getSrcValueOffset(), LD->getMemoryVT(),
+                              LD->isVolatile(), LD->isNonTemporal(), Align);
+    }
+  }
+
   if (CombinerAA) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
@@ -5250,17 +5250,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   SDValue Value = ST->getValue();
   SDValue Ptr   = ST->getBasePtr();
 
-  // Try to infer better alignment information than the store already has.
-  if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
-    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
-      if (Align > ST->getAlignment())
-        return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
-                                 Ptr, ST->getSrcValue(),
-                                 ST->getSrcValueOffset(), ST->getMemoryVT(),
-                                 ST->isVolatile(), ST->isNonTemporal(), Align);
-    }
-  }
-
   // If this is a store of a bit convert, store the input value if the
   // resultant store does not need a higher alignment than the original.
   if (Value.getOpcode() == ISD::BIT_CONVERT && !ST->isTruncatingStore() &&
@@ -5351,6 +5340,17 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     }
   }
 
+  // Try to infer better alignment information than the store already has.
+  if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+      if (Align > ST->getAlignment())
+        return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
+                                 Ptr, ST->getSrcValue(),
+                                 ST->getSrcValueOffset(), ST->getMemoryVT(),
+                                 ST->isVolatile(), ST->isNonTemporal(), Align);
+    }
+  }
+
   if (CombinerAA) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4a207ede91e..25bf8612869 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3132,11 +3132,17 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
   if (Str.empty()) {
     if (VT.isInteger())
       return DAG.getConstant(0, VT);
-    unsigned NumElts = VT.getVectorNumElements();
-    MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
-                       DAG.getConstant(0,
-                       EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts)));
+    else if (VT.getSimpleVT().SimpleTy == MVT::f32 ||
+             VT.getSimpleVT().SimpleTy == MVT::f64)
+      return DAG.getConstantFP(0.0, VT);
+    else if (VT.isVector()) {
+      unsigned NumElts = VT.getVectorNumElements();
+      MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                         DAG.getConstant(0, EVT::getVectorVT(*DAG.getContext(),
+                                                             EltVT, NumElts)));
+    } else
+      llvm_unreachable("Expected type!");
   }
 
   assert(!VT.isVector() && "Can't handle vector type here!");
@@ -3184,51 +3190,33 @@ static bool isMemSrcFromString(SDValue Src, std::string &Str) {
   return false;
 }
 
-/// MeetsMaxMemopRequirement - Determines if the number of memory ops required
-/// to replace the memset / memcpy is below the threshold. It also returns the
-/// types of the sequence of memory ops to perform memset / memcpy.
-static
-bool MeetsMaxMemopRequirement(std::vector<EVT> &MemOps,
-                              SDValue Dst, SDValue Src,
-                              unsigned Limit, uint64_t Size, unsigned &Align,
-                              std::string &Str, bool &isSrcStr,
-                              SelectionDAG &DAG,
-                              const TargetLowering &TLI) {
-  isSrcStr = isMemSrcFromString(Src, Str);
-  bool isSrcConst = isa<ConstantSDNode>(Src);
-  EVT VT = TLI.getOptimalMemOpType(Size, Align, isSrcConst, isSrcStr, DAG);
-  bool AllowUnalign = TLI.allowsUnalignedMemoryAccesses(VT);
-  if (VT != MVT::Other) {
-    const Type *Ty = VT.getTypeForEVT(*DAG.getContext());
-    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
-    // If source is a string constant, this will require an unaligned load.
-    if (NewAlign > Align && (isSrcConst || AllowUnalign)) {
-      if (Dst.getOpcode() != ISD::FrameIndex) {
-        // Can't change destination alignment. It requires a unaligned store.
-        if (AllowUnalign)
-          VT = MVT::Other;
-      } else {
-        int FI = cast<FrameIndexSDNode>(Dst)->getIndex();
-        MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-        if (MFI->isFixedObjectIndex(FI)) {
-          // Can't change destination alignment. It requires a unaligned store.
-          if (AllowUnalign)
-            VT = MVT::Other;
-        } else {
-          // Give the stack frame object a larger alignment if needed.
-          if (MFI->getObjectAlignment(FI) < NewAlign)
-            MFI->setObjectAlignment(FI, NewAlign);
-          Align = NewAlign;
-        }
-      }
-    }
-  }
-
+/// FindOptimalMemOpLowering - Determines the optimial series memory ops
+/// to replace the memset / memcpy. Return true if the number of memory ops
+/// is below the threshold. It returns the types of the sequence of
+/// memory ops to perform memset / memcpy by reference.
+static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
+                                     SDValue Dst, SDValue Src,
+                                     unsigned Limit, uint64_t Size,
+                                     unsigned DstAlign, unsigned SrcAlign,
+                                     SelectionDAG &DAG,
+                                     const TargetLowering &TLI) {
+  assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
+         "Expecting memcpy / memset source to meet alignment requirement!");
+  // If 'SrcAlign' is zero, that means the memory operation does not need load
+  // the value, i.e. memset or memcpy from constant string. Otherwise, it's
+  // the inferred alignment of the source. 'DstAlign', on the other hand, is the
+  // specified alignment of the memory operation. If it is zero, that means
+  // it's possible to change the alignment of the destination.
+  EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign, DAG);
 
   if (VT == MVT::Other) {
-    if (TLI.allowsUnalignedMemoryAccesses(MVT::i64)) {
+    VT = TLI.getPointerTy();
+    const Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+    if (DstAlign >= TLI.getTargetData()->getABITypeAlignment(Ty) ||
+        TLI.allowsUnalignedMemoryAccesses(VT)) {
       VT = MVT::i64;
     } else {
-      switch (Align & 7) {
+      switch (DstAlign & 7) {
       case 0:  VT = MVT::i64; break;
       case 4:  VT = MVT::i32; break;
       case 2:  VT = MVT::i16; break;
@@ -3250,7 +3238,7 @@ bool MeetsMaxMemopRequirement(std::vector<EVT> &MemOps,
     unsigned VTSize = VT.getSizeInBits() / 8;
     while (VTSize > Size) {
       // For now, only use non-vector load / store's for the left-over pieces.
-      if (VT.isVector()) {
+      if (VT.isVector() || VT.isFloatingPoint()) {
         VT = MVT::i64;
         while (!TLI.isTypeLegal(VT))
           VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
@@ -3286,15 +3274,33 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
   uint64_t Limit = -1ULL;
   if (!AlwaysInline)
     Limit = TLI.getMaxStoresPerMemcpy();
-  unsigned DstAlign = Align;  // Destination alignment can change.
+  bool DstAlignCanChange = false;
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+  if (Align > SrcAlign)
+    SrcAlign = Align;
   std::string Str;
-  bool CopyFromStr;
-  if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
-                                Str, CopyFromStr, DAG, TLI))
+  bool CopyFromStr = isMemSrcFromString(Src, Str);
+  bool isZeroStr = CopyFromStr && Str.empty();
+  if (!FindOptimalMemOpLowering(MemOps, Dst, Src, Limit, Size,
+                                (DstAlignCanChange ? 0 : Align),
+                                (isZeroStr ? 0 : SrcAlign), DAG, TLI))
     return SDValue();
 
+  if (DstAlignCanChange) {
+    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
 
-  bool isZeroStr = CopyFromStr && Str.empty();
   SmallVector<SDValue, 8> OutChains;
   unsigned NumMemOps = MemOps.size();
   uint64_t SrcOff = 0, DstOff = 0;
@@ -3303,16 +3309,17 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
     unsigned VTSize = VT.getSizeInBits() / 8;
     SDValue Value, Store;
 
-    if (CopyFromStr && (isZeroStr || !VT.isVector())) {
+    if (CopyFromStr &&
+        (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
       // It's unlikely a store of a vector immediate can be done in a single
       // instruction. It would require a load from a constantpool first.
-      // We also handle store a vector with all zero's.
+      // We only handle zero vectors here.
      // FIXME: Handle other cases where store of vector immediate is done in
      // a single instruction.
       Value = getMemsetStringVal(VT, dl, DAG, TLI, Str, SrcOff);
       Store = DAG.getStore(Chain, dl, Value,
                            getMemBasePlusOffset(Dst, DstOff, DAG),
-                           DstSV, DstSVOff + DstOff, false, false, DstAlign);
+                           DstSV, DstSVOff + DstOff, false, false, Align);
     } else {
       // The type might not be legal for the target.  This should only happen
      // if the type is smaller than a legal type, as on PPC, so the right
@@ -3323,11 +3330,12 @@
       assert(NVT.bitsGE(VT));
       Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
                              getMemBasePlusOffset(Src, SrcOff, DAG),
-                             SrcSV, SrcSVOff + SrcOff, VT, false, false, Align);
+                             SrcSV, SrcSVOff + SrcOff, VT, false, false,
+                             MinAlign(SrcAlign, SrcOff));
       Store = DAG.getTruncStore(Chain, dl, Value,
                                 getMemBasePlusOffset(Dst, DstOff, DAG),
                                 DstSV, DstSVOff + DstOff, VT, false, false,
-                                DstAlign);
+                                Align);
     }
     OutChains.push_back(Store);
     SrcOff += VTSize;
@@ -3339,11 +3347,11 @@
 }
 
 static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
-                                         SDValue Chain, SDValue Dst,
-                                         SDValue Src, uint64_t Size,
-                                         unsigned Align, bool AlwaysInline,
-                                         const Value *DstSV, uint64_t DstSVOff,
-                                         const Value *SrcSV, uint64_t SrcSVOff){
+                                        SDValue Chain, SDValue Dst,
+                                        SDValue Src, uint64_t Size,
+                                        unsigned Align,bool AlwaysInline,
+                                        const Value *DstSV, uint64_t DstSVOff,
+                                        const Value *SrcSV, uint64_t SrcSVOff) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // Expand memmove to a series of load and store ops if the size operand falls
@@ -3352,15 +3360,32 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
   uint64_t Limit = -1ULL;
   if (!AlwaysInline)
     Limit = TLI.getMaxStoresPerMemmove();
-  unsigned DstAlign = Align;  // Destination alignment can change.
-  std::string Str;
-  bool CopyFromStr;
-  if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
-                                Str, CopyFromStr, DAG, TLI))
+  bool DstAlignCanChange = false;
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+  if (Align > SrcAlign)
+    SrcAlign = Align;
+
+  if (!FindOptimalMemOpLowering(MemOps, Dst, Src, Limit, Size,
+                                (DstAlignCanChange ? 0 : Align),
+                                SrcAlign, DAG, TLI))
     return SDValue();
 
-  uint64_t SrcOff = 0, DstOff = 0;
+  if (DstAlignCanChange) {
+    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+  uint64_t SrcOff = 0, DstOff = 0;
   SmallVector<SDValue, 8> LoadValues;
   SmallVector<SDValue, 8> LoadChains;
   SmallVector<SDValue, 8> OutChains;
@@ -3372,7 +3397,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
     Value = DAG.getLoad(VT, dl, Chain,
                         getMemBasePlusOffset(Src, SrcOff, DAG),
-                        SrcSV, SrcSVOff + SrcOff, false, false, Align);
+                        SrcSV, SrcSVOff + SrcOff, false, false, SrcAlign);
     LoadValues.push_back(Value);
     LoadChains.push_back(Value.getValue(1));
     SrcOff += VTSize;
@@ -3387,7 +3412,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
     Store = DAG.getStore(Chain, dl, LoadValues[i],
                          getMemBasePlusOffset(Dst, DstOff, DAG),
-                         DstSV, DstSVOff + DstOff, false, false, DstAlign);
+                         DstSV, DstSVOff + DstOff, false, false, Align);
     OutChains.push_back(Store);
     DstOff += VTSize;
   }
@@ -3397,24 +3422,38 @@
 static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
-                                SDValue Chain, SDValue Dst,
-                                SDValue Src, uint64_t Size,
-                                unsigned Align,
-                                const Value *DstSV, uint64_t DstSVOff) {
+                               SDValue Chain, SDValue Dst,
+                               SDValue Src, uint64_t Size,
+                               unsigned Align,
+                               const Value *DstSV, uint64_t DstSVOff) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // Expand memset to a series of load/store ops if the size operand
   // falls below a certain threshold.
   std::vector<EVT> MemOps;
-  std::string Str;
-  bool CopyFromStr;
-  if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, TLI.getMaxStoresPerMemset(),
-                                Size, Align, Str, CopyFromStr, DAG, TLI))
+  bool DstAlignCanChange = false;
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  if (!FindOptimalMemOpLowering(MemOps, Dst, Src, TLI.getMaxStoresPerMemset(),
+                                Size, (DstAlignCanChange ? 0 : Align), 0,
+                                DAG, TLI))
     return SDValue();
 
+  if (DstAlignCanChange) {
+    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+
   SmallVector<SDValue, 8> OutChains;
   uint64_t DstOff = 0;
-
   unsigned NumMemOps = MemOps.size();
   for (unsigned i = 0; i < NumMemOps; i++) {
     EVT VT = MemOps[i];
@@ -3445,10 +3484,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
     if (ConstantSize->isNullValue())
       return Chain;
 
-    SDValue Result =
-      getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
-                              ConstantSize->getZExtValue(),
-                              Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
+    SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+                                ConstantSize->getZExtValue(),Align,
+                                false, DstSV, DstSVOff, SrcSV, SrcSVOff);
     if (Result.getNode())
       return Result;
   }
@@ -6106,8 +6144,18 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
   // If this is a GlobalAddress + cst, return the alignment.
   GlobalValue *GV;
   int64_t GVOffset = 0;
-  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset))
-    return MinAlign(GV->getAlignment(), GVOffset);
+  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
+    // If GV has specified alignment, then use it. Otherwise, use the preferred
+    // alignment.
+    unsigned Align = GV->getAlignment();
+    if (!Align) {
+      if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) {
+        const TargetData *TD = TLI.getTargetData();
+        Align = TD->getPreferredAlignment(GVar);
+      }
+    }
+    return MinAlign(Align, GVOffset);
+  }
 
   // If this is a direct reference to a stack slot, use information about the
   // stack slot's alignment.
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 2c072c1290f..d00fbff77a1 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -5539,8 +5539,8 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   return false;
 }
 
-EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
-                                           bool isSrcConst, bool isSrcStr,
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
                                            SelectionDAG &DAG) const {
   if (this->PPCSubTarget.isPPC64()) {
     return MVT::i64;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 9c390ac1014..2d5daefb333 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -347,8 +347,8 @@ namespace llvm {
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-    virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align,
-                                    bool isSrcConst, bool isSrcStr,
+    virtual EVT getOptimalMemOpType(uint64_t Size,
+                                    unsigned DstAlign, unsigned SrcAlign,
                                     SelectionDAG &DAG) const;
 
     /// getFunctionAlignment - Return the Log2 alignment of this function.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b94f76efa6c..bd268eca8f5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1012,7 +1012,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   // FIXME: These should be based on subtarget info. Plus, the values should
   // be smaller when we are in optimizing for size mode.
   maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
-  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
   setPrefLoopAlignment(16);
   benefitFromCodePlacementOpt = true;
@@ -1074,19 +1074,27 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
 /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
 /// determining it.
 EVT
-X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
-                                       bool isSrcConst, bool isSrcStr,
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+                                       unsigned DstAlign, unsigned SrcAlign,
                                        SelectionDAG &DAG) const {
   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
   // linux. This is because the stack realignment code can't handle certain
   // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = DAG.getMachineFunction().getFunction();
-  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
-  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
-    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
-      return MVT::v4i32;
-    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
-      return MVT::v4f32;
+  if (!F->hasFnAttr(Attribute::NoImplicitFloat)) {
+    if (Size >= 16 &&
+        (Subtarget->isUnalignedMemAccessFast() ||
+         (DstAlign == 0 || DstAlign >= 16) &&
+         (SrcAlign == 0 || SrcAlign >= 16)) &&
+        Subtarget->getStackAlignment() >= 16) {
+      if (Subtarget->hasSSE2())
+        return MVT::v4i32;
+      if (Subtarget->hasSSE1())
+        return MVT::v4f32;
+    } else if (Size >= 8 &&
+               Subtarget->getStackAlignment() >= 8 &&
+               Subtarget->hasSSE2())
+      return MVT::f64;
   }
   if (Subtarget->is64Bit() && Size >= 8)
     return MVT::i64;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 46fa3cefdc3..569dc1fbb25 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -423,8 +423,8 @@ namespace llvm {
     /// and store operations as a result of memset, memcpy, and memmove
     /// lowering. It returns EVT::iAny if SelectionDAG should be responsible for
     /// determining it.
-    virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align,
-                                    bool isSrcConst, bool isSrcStr,
+    virtual EVT getOptimalMemOpType(uint64_t Size,
+                                    unsigned DstAlign, unsigned SrcAlign,
                                     SelectionDAG &DAG) const;
 
     /// allowsUnalignedMemoryAccesses - Returns true if the target allows
diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
index 3ce9edb5db4..26bf09c723a 100644
--- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
+++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
 ; rdar://7396984
 
 @str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 0da93bad04e..686ed9c74dd 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -1,10 +1,17 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | egrep {add|lea} | grep 16
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
 
 %struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
+                   <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
                    <2 x i64> }
 
 define i32 @main() nounwind {
 entry:
+; CHECK: main:
+; CHECK: movl $1, (%esp)
+; CHECK: leal 16(%esp), %edi
+; CHECK: movl $36, %ecx
+; CHECK: leal 160(%esp), %esi
+; CHECK: rep;movsl
   %s = alloca %struct.S		; <%struct.S*> [#uses=2]
   %tmp15 = getelementptr %struct.S* %s, i32 0, i32 0		; <<2 x i64>*> [#uses=1]
   store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index 2dc939e666f..079c40252b7 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -1,15 +1,105 @@
-; RUN: llc < %s -march=x86 -mattr=-sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7
-; RUN: llc < %s -march=x86 -mattr=+sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 5
+; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=SSE2
+; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=SSE1
+; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=NOSSE
 
 %struct.ParmT = type { [25 x i8], i8, i8* }
 @.str12 = internal constant [25 x i8] c"image\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00"		; <[25 x i8]*> [#uses=1]
 
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
-
-define void @t(i32 %argc, i8** %argv) nounwind {
+define void @t1(i32 %argc, i8** %argv) nounwind {
 entry:
+; SSE2: t1:
+; SSE2: movaps _.str12, %xmm0
+; SSE2: movaps %xmm0
+; SSE2: movb $0
+; SSE2: movl $0
+; SSE2: movl $0
+
+; SSE1: t1:
+; SSE1: movaps _.str12, %xmm0
+; SSE1: movaps %xmm0
+; SSE1: movb $0
+; SSE1: movl $0
+; SSE1: movl $0
+
+; NOSSE: t1:
+; NOSSE: movb $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $101
+; NOSSE: movl $1734438249
 	%parms.i = alloca [13 x %struct.ParmT]		; <[13 x %struct.ParmT]*> [#uses=1]
 	%parms1.i = getelementptr [13 x %struct.ParmT]* %parms.i, i32 0, i32 0, i32 0, i32 0		; <i8*> [#uses=1]
 	call void @llvm.memcpy.i32( i8* %parms1.i, i8* getelementptr ([25 x i8]* @.str12, i32 0, i32 0), i32 25, i32 1 ) nounwind
 	unreachable
 }
+
+;rdar://7774704
+%struct.s0 = type { [2 x double] }
+
+define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
+entry:
+; SSE2: t2:
+; SSE2: movaps (%eax), %xmm0
+; SSE2: movaps %xmm0, (%eax)
+
+; SSE1: t2:
+; SSE1: movaps (%eax), %xmm0
+; SSE1: movaps %xmm0, (%eax)
+
+; NOSSE: t2:
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+  %tmp2 = bitcast %struct.s0* %a to i8*		; <i8*> [#uses=1]
+  %tmp3 = bitcast %struct.s0* %b to i8*		; <i8*> [#uses=1]
+  tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 16)
+  ret void
+}
+
+define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
+entry:
+; SSE2: t3:
+; SSE2: movsd (%eax), %xmm0
+; SSE2: movsd 8(%eax), %xmm1
+; SSE2: movsd %xmm1, 8(%eax)
+; SSE2: movsd %xmm0, (%eax)
+
+; SSE1: t3:
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+
+; NOSSE: t3:
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+  %tmp2 = bitcast %struct.s0* %a to i8*		; <i8*> [#uses=1]
+  %tmp3 = bitcast %struct.s0* %b to i8*		; <i8*> [#uses=1]
+  tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8)
+  ret void
+}
+
+declare void @llvm.memcpy.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index 7deb52f8078..e2eba76da8a 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -1,47 +1,13 @@
-; RUN: llc < %s | not grep rep
-; RUN: llc < %s | grep memset
+; RUN: llc < %s | FileCheck %s
 
 target triple = "i386"
 
 declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
 
-define fastcc i32 @cli_scanzip(i32 %desc) nounwind {
+define fastcc void @t() nounwind {
 entry:
-	br label %bb8.i.i.i.i
-
-bb8.i.i.i.i:		; preds = %bb8.i.i.i.i, %entry
-	icmp eq i32 0, 0		; <i1>:0 [#uses=1]
-	br i1 %0, label %bb61.i.i.i, label %bb8.i.i.i.i
-
-bb32.i.i.i:		; preds = %bb61.i.i.i
-	ptrtoint i8* %tail.0.i.i.i to i32		; <i32>:1 [#uses=1]
-	sub i32 0, %1		; <i32>:2 [#uses=1]
-	icmp sgt i32 %2, 19		; <i1>:3 [#uses=1]
-	br i1 %3, label %bb34.i.i.i, label %bb61.i.i.i
-
-bb34.i.i.i:		; preds = %bb32.i.i.i
-	load i32* null, align 4		; <i32>:4 [#uses=1]
-	icmp eq i32 %4, 101010256		; <i1>:5 [#uses=1]
-	br i1 %5, label %bb8.i11.i.i.i, label %bb61.i.i.i
-
-bb8.i11.i.i.i:		; preds = %bb8.i11.i.i.i, %bb34.i.i.i
-	icmp eq i32 0, 0		; <i1>:6 [#uses=1]
-	br i1 %6, label %cli_dbgmsg.exit49.i, label %bb8.i11.i.i.i
-
-cli_dbgmsg.exit49.i:		; preds = %bb8.i11.i.i.i
-	icmp eq [32768 x i8]* null, null		; <i1>:7 [#uses=1]
-	br i1 %7, label %bb1.i28.i, label %bb8.i.i
-
-bb61.i.i.i:		; preds = %bb61.i.i.i, %bb34.i.i.i, %bb32.i.i.i, %bb8.i.i.i.i
-	%tail.0.i.i.i = getelementptr [1024 x i8]* null, i32 0, i32 0		; <i8*> [#uses=2]
-	load i8* %tail.0.i.i.i, align 1		; <i8>:8 [#uses=1]
-	icmp eq i8 %8, 80		; <i1>:9 [#uses=1]
-	br i1 %9, label %bb32.i.i.i, label %bb61.i.i.i
-
-bb1.i28.i:		; preds = %cli_dbgmsg.exit49.i
-	call void @llvm.memset.i32( i8* null, i8 0, i32 88, i32 1 ) nounwind
-	unreachable
-
-bb8.i.i:		; preds = %bb8.i.i, %cli_dbgmsg.exit49.i
-	br label %bb8.i.i
+; CHECK: t:
+; CHECK: call memset
+	call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind
+	unreachable
 }
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll
index da8fc51da8e..8b817b49eec 100644
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep stosl
+; RUN: llc < %s -mtriple=i386-apple-darwin | grep movl | count 20
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movq | count 10
 
 define void @bork() nounwind {
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index 9ec9182e5e3..711dc51e4d7 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | not grep movs
+; RUN: llc < %s | grep movsd | count 8
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll
index b61803d10ce..2e9b248314c 100644
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
 
 @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
@@ -11,7 +12,11 @@ entry:
 
 bb:
   %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0
   call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1)
-; CHECK: movups _.str3
+; CORE2: movsd _.str3+16
+; CORE2: movsd _.str3+8
+; CORE2: movsd _.str3
+
+; COREI7: movups _.str3
   br label %bb
 
 return:
@@ -20,8 +25,14 @@ return:
 
 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 
-; CHECK: .align 3
-; CHECK-NEXT: _.str1:
-; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CHECK: .align 3
-; CHECK-NEXT: _.str3:
+; CORE2: .align 3
+; CORE2-NEXT: _.str1:
+; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
+; CORE2: .align 3
+; CORE2-NEXT: _.str3:
+
+; COREI7: .align 3
+; COREI7-NEXT: _.str1:
+; COREI7-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
+; COREI7: .align 3
+; COREI7-NEXT: _.str3:
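
For readers adapting an out-of-tree target to the reworked hook, the standalone C++ sketch below illustrates the decision shape the new DstAlign / SrcAlign parameters are meant to drive, following the convention documented in FindOptimalMemOpLowering above: DstAlign == 0 means the destination alignment can still be raised, and SrcAlign == 0 means no load is needed (memset, or memcpy from a constant string). It is a hypothetical illustration only; chooseMemOpVT, MemOpVT, and the 16-byte thresholds are not LLVM APIs or part of the patch.

// Hypothetical sketch only -- not an LLVM API and not part of the patch above.
#include <cstdint>

enum class MemOpVT { Other, i32, f64, v4i32 };

static MemOpVT chooseMemOpVT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                             bool FastUnalignedAccess) {
  // DstAlign == 0: destination alignment may still be raised (e.g. a local
  // stack object). SrcAlign == 0: no load is required (memset, or memcpy from
  // a constant string), so only the store side constrains the chosen type.
  bool DstOK = DstAlign == 0 || DstAlign >= 16;
  bool SrcOK = SrcAlign == 0 || SrcAlign >= 16;
  if (Size >= 16 && (FastUnalignedAccess || (DstOK && SrcOK)))
    return MemOpVT::v4i32;   // 16-byte vector loads / stores
  if (Size >= 8)
    return MemOpVT::f64;     // 8-byte FP moves as a narrower wide option
  if (Size >= 4)
    return MemOpVT::i32;
  return MemOpVT::Other;     // let SelectionDAG pick per-piece integer types
}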