X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FCodeGen%2FSelectionDAG%2FDAGCombiner.cpp;h=1f98fde13a89b0f54946a85f009842c99fbcb8af;hb=66c960350c9f1380ae5adbb19947e435d7b0d752;hp=64cc1f5f7cb84e1a4aa24d53894d2962341a2528;hpb=bc2572cac5d88b92c23de4911144871f2d3c4f6e;p=oota-llvm.git diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 64cc1f5f7cb..1f98fde13a8 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17,9 +17,9 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -327,6 +327,7 @@ namespace { SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); + SDValue CombineExtLoad(SDNode *N); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); @@ -363,6 +364,28 @@ namespace { /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); + /// Holds a pointer to an LSBaseSDNode as well as information on where it + /// is located in a sequence of memory operations connected by a chain. + struct MemOpLink { + MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): + MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } + // Ptr to the mem node. + LSBaseSDNode *MemNode; + // Offset from the base ptr. + int64_t OffsetFromBase; + // What is the sequence number of this mem node. + // Lowest mem operand in the DAG starts at zero. + unsigned SequenceNum; + }; + + /// This is a helper function for MergeConsecutiveStores. When the source + /// elements of the consecutive stores are all constants or all extracted + /// vector elements, try to merge them into one larger store. + /// \return True if a merged store was created. + bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl &StoreNodes, + EVT MemVT, unsigned NumElem, + bool IsConstantSrc, bool UseVector); + /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return True if some memory operations were changed. @@ -380,12 +403,9 @@ namespace { DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { - AttributeSet FnAttrs = - DAG.getMachineFunction().getFunction()->getAttributes(); - ForCodeSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + auto *F = DAG.getMachineFunction().getFunction(); + ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) || + F->hasFnAttribute(Attribute::MinSize); } /// Runs the dag combiner on all nodes in the work list @@ -446,7 +466,7 @@ void TargetLowering::DAGCombinerInfo::RemoveFromWorklist(SDNode *N) { } SDValue TargetLowering::DAGCombinerInfo:: -CombineTo(SDNode *N, const std::vector &To, bool AddTo) { +CombineTo(SDNode *N, ArrayRef To, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo); } @@ -738,10 +758,9 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, if (SDNode *L = isConstantBuildVectorOrConstantInt(N0.getOperand(1))) { if (SDNode *R = isConstantBuildVectorOrConstantInt(N1)) { // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2)) - SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R); - if (!OpNode.getNode()) - return SDValue(); - return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); + if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R)) + return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); + return SDValue(); } if (N0.hasOneUse()) { // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one @@ -759,10 +778,9 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, if (SDNode *R = isConstantBuildVectorOrConstantInt(N1.getOperand(1))) { if (SDNode *L = isConstantBuildVectorOrConstantInt(N0)) { // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2)) - SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L); - if (!OpNode.getNode()) - return SDValue(); - return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); + if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L)) + return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); + return SDValue(); } if (N1.hasOneUse()) { // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one @@ -877,8 +895,8 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { if (LoadSDNode *LD = dyn_cast(Op)) { EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) - ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD - : ISD::EXTLOAD) + ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD + : ISD::EXTLOAD) : LD->getExtensionType(); Replace = true; return DAG.getExtLoad(ExtType, dl, PVT, @@ -1099,8 +1117,8 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { LoadSDNode *LD = cast(N); EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) - ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD - : ISD::EXTLOAD) + ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD + : ISD::EXTLOAD) : LD->getExtensionType(); SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT, LD->getChain(), LD->getBasePtr(), @@ -1163,10 +1181,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) { LegalTypes = Level >= AfterLegalizeTypes; // Early exit if this basic block is in an optnone function. - AttributeSet FnAttrs = - DAG.getMachineFunction().getFunction()->getAttributes(); - if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeNone)) + if (DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::OptimizeNone)) return; // Add all the dag nodes to the worklist. @@ -1480,7 +1496,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { switch (Op.getOpcode()) { case ISD::EntryToken: // Entry tokens don't need to be added to the list. They are - // rededundant. + // redundant. Changed = true; break; @@ -1509,7 +1525,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { SDValue Result; - // If we've change things around then replace token factor. + // If we've changed things around then replace token factor. if (Changed) { if (Ops.empty()) { // The entry token is the only possible outcome. @@ -1519,8 +1535,11 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); } - // Don't add users to work list. - return CombineTo(N, Result, false); + // Add users to worklist if AA is enabled, since it may introduce + // a lot of new chained token factors while removing memory deps. + bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA + : DAG.getSubtarget().useAA(); + return CombineTo(N, Result, UseAA /*add to worklist*/); } return Result; @@ -2800,6 +2819,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // actually legal and isn't going to get expanded, else this is a false // optimisation. bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, + Load->getValueType(0), Load->getMemoryVT()); // Resize the constant to the same size as the original memory access before @@ -2926,7 +2946,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -2946,7 +2966,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -2972,10 +2992,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); EVT LoadedVT = LN0->getMemoryVT(); + EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; if (ExtVT == LoadedVT && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) { - EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; + (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, + ExtVT))) { SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, @@ -2990,7 +3011,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) { + (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, + ExtVT))) { EVT PtrType = LN0->getOperand(1).getValueType(); unsigned Alignment = LN0->getAlignment(); @@ -3010,7 +3032,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) { AddToWorklist(NewPtr.getNode()); - EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; SDValue Load = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr, @@ -3447,12 +3468,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { isa(N0.getOperand(1))) { ConstantSDNode *C1 = cast(N0.getOperand(1)); if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) { - SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1); - if (!COR.getNode()) - return SDValue(); - return DAG.getNode(ISD::AND, SDLoc(N), VT, - DAG.getNode(ISD::OR, SDLoc(N0), VT, - N0.getOperand(0), N1), COR); + if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1)) + return DAG.getNode( + ISD::AND, SDLoc(N), VT, + DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); + return SDValue(); } } // fold (or (setcc x), (setcc y)) -> (setcc (or x, y)) @@ -3528,6 +3548,17 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } } + // (or (and X, M), (and X, N)) -> (and X, (or M, N)) + if (N0.getOpcode() == ISD::AND && + N1.getOpcode() == ISD::AND && + N0.getOperand(0) == N1.getOperand(0) && + // Don't increase # computations. + (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { + SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, + N0.getOperand(1), N1.getOperand(1)); + return DAG.getNode(ISD::AND, SDLoc(N), VT, N0.getOperand(0), X); + } + // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); @@ -4068,8 +4099,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC && TLI.getBooleanContents(N00.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) { - SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV); - if (C.getNode()) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV)) return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C); } } else { @@ -4615,6 +4645,43 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) { return SDValue(); } + +/// \brief Generate Min/Max node +static SDValue combineMinNumMaxNum(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS, + SDValue True, SDValue False, + ISD::CondCode CC, const TargetLowering &TLI, + SelectionDAG &DAG) { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + switch (CC) { + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: { + unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; + if (TLI.isOperationLegal(Opcode, VT)) + return DAG.getNode(Opcode, DL, VT, LHS, RHS); + return SDValue(); + } + case ISD::SETOGT: + case ISD::SETOGE: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: { + unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM; + if (TLI.isOperationLegal(Opcode, VT)) + return DAG.getNode(Opcode, DL, VT, LHS, RHS); + return SDValue(); + } + default: + return SDValue(); + } +} + SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4694,6 +4761,28 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // fold selects based on a setcc into other things, such as min/max/abs if (N0.getOpcode() == ISD::SETCC) { + // select x, y (fcmp lt x, y) -> fminnum x, y + // select x, y (fcmp gt x, y) -> fmaxnum x, y + // + // This is OK if we don't care about what happens if either operand is a + // NaN. + // + + // FIXME: Instead of testing for UnsafeFPMath, this should be checking for + // no signed zeros as well as no nans. + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.UnsafeFPMath && + VT.isFloatingPoint() && N0.hasOneUse() && + DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { + ISD::CondCode CC = cast(N0.getOperand(2))->get(); + + SDValue FMinMax = + combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), + N1, N2, CC, TLI, DAG); + if (FMinMax) + return FMinMax; + } + if ((!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || TLI.isOperationLegal(ISD::SELECT_CC, VT)) @@ -4785,7 +4874,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { MaskedStoreSDNode *MST = dyn_cast(N); SDValue Mask = MST->getMask(); - SDValue Data = MST->getData(); + SDValue Data = MST->getValue(); SDLoc DL(N); // If the MSTORE data type requires splitting and the mask is provided by a @@ -4828,7 +4917,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, MST->getAAInfo(), MST->getRanges()); - Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO); + Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, + MST->isTruncatingStore()); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -4840,7 +4930,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { SecondHalfAlignment, MST->getAAInfo(), MST->getRanges()); - Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO); + Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, + MST->isTruncatingStore()); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); @@ -4901,7 +4992,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); - Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO); + Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO, + ISD::NON_EXTLOAD); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -4912,7 +5004,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); - Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO); + Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO, + ISD::NON_EXTLOAD); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); @@ -5043,13 +5136,16 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { return N2; // cond always true -> true val else return N3; // cond always false -> false val - } - - // Fold to a simpler select_cc - if (SCC.getOpcode() == ISD::SETCC) + } else if (SCC->getOpcode() == ISD::UNDEF) { + // When the condition is UNDEF, just return the first operand. This is + // coherent the DAG creation, no setcc node is created in this case + return N2; + } else if (SCC.getOpcode() == ISD::SETCC) { + // Fold to a simpler select_cc return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), SCC.getOperand(1), N2, N3, SCC.getOperand(2)); + } } // If we can fold this based on the true/false value, do so. @@ -5210,6 +5306,102 @@ void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl &SetCCs, } } +// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?). +SDValue DAGCombiner::CombineExtLoad(SDNode *N) { + SDValue N0 = N->getOperand(0); + EVT DstVT = N->getValueType(0); + EVT SrcVT = N0.getValueType(); + + assert((N->getOpcode() == ISD::SIGN_EXTEND || + N->getOpcode() == ISD::ZERO_EXTEND) && + "Unexpected node type (not an extend)!"); + + // fold (sext (load x)) to multiple smaller sextloads; same for zext. + // For example, on a target with legal v4i32, but illegal v8i32, turn: + // (v8i32 (sext (v8i16 (load x)))) + // into: + // (v8i32 (concat_vectors (v4i32 (sextload x)), + // (v4i32 (sextload (x + 16))))) + // Where uses of the original load, i.e.: + // (v8i16 (load x)) + // are replaced with: + // (v8i16 (truncate + // (v8i32 (concat_vectors (v4i32 (sextload x)), + // (v4i32 (sextload (x + 16))))))) + // + // This combine is only applicable to illegal, but splittable, vectors. + // All legal types, and illegal non-vector types, are handled elsewhere. + // This combine is controlled by TargetLowering::isVectorLoadExtDesirable. + // + if (N0->getOpcode() != ISD::LOAD) + return SDValue(); + + LoadSDNode *LN0 = cast(N0); + + if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) || + !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() || + !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0))) + return SDValue(); + + SmallVector SetCCs; + if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI)) + return SDValue(); + + ISD::LoadExtType ExtType = + N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + + // Try to split the vector types to get down to legal types. + EVT SplitSrcVT = SrcVT; + EVT SplitDstVT = DstVT; + while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) && + SplitSrcVT.getVectorNumElements() > 1) { + SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first; + SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first; + } + + if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) + return SDValue(); + + SDLoc DL(N); + const unsigned NumSplits = + DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements(); + const unsigned Stride = SplitSrcVT.getStoreSize(); + SmallVector Loads; + SmallVector Chains; + + SDValue BasePtr = LN0->getBasePtr(); + for (unsigned Idx = 0; Idx < NumSplits; Idx++) { + const unsigned Offset = Idx * Stride; + const unsigned Align = MinAlign(LN0->getAlignment(), Offset); + + SDValue SplitLoad = DAG.getExtLoad( + ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr, + LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, + LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), + Align, LN0->getAAInfo()); + + BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, + DAG.getConstant(Stride, BasePtr.getValueType())); + + Loads.push_back(SplitLoad.getValue(0)); + Chains.push_back(SplitLoad.getValue(1)); + } + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); + + CombineTo(N, NewValue); + + // Replace uses of the original load (before extension) + // with a truncate of the concatenated sextloaded vectors. + SDValue Trunc = + DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue); + CombineTo(N0.getNode(), Trunc, NewChain); + ExtendSetCCUses(SetCCs, Trunc, NewValue, DL, + (ISD::NodeType)N->getOpcode()); + return SDValue(N, 0); // Return N so it doesn't get rechecked! +} + SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -5276,17 +5468,18 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } // fold (sext (load x)) -> (sext (truncate (sextload x))) - // None of the supported targets knows how to perform load and sign extend - // on vectors in one instruction. We only perform this transformation on - // scalars. - if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && - ISD::isUNINDEXEDLoad(N0.getNode()) && - ((!LegalOperations && !cast(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) { + // Only generate vector extloads when 1) they're legal, and 2) they are + // deemed desirable by the target. + if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && + ((!LegalOperations && !VT.isVector() && + !cast(N0)->isVolatile()) || + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI); + if (VT.isVector()) + DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, @@ -5303,6 +5496,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } } + // fold (sext (load x)) to multiple smaller sextloads. + // Only on illegal but splittable vectors. + if (SDValue ExtLoad = CombineExtLoad(N)) + return ExtLoad; + // fold (sext (sextload x)) -> (sext (truncate (sextload x))) // fold (sext ( extload x)) -> (sext (truncate (sextload x))) if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) && @@ -5310,7 +5508,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, @@ -5330,7 +5528,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && - TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) && + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) { @@ -5566,17 +5764,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } // fold (zext (load x)) -> (zext (truncate (zextload x))) - // None of the supported targets knows how to perform load and vector_zext - // on vectors in one instruction. We only perform this transformation on - // scalars. - if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && - ISD::isUNINDEXEDLoad(N0.getNode()) && - ((!LegalOperations && !cast(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) { + // Only generate vector extloads when 1) they're legal, and 2) they are + // deemed desirable by the target. + if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && + ((!LegalOperations && !VT.isVector() && + !cast(N0)->isVolatile()) || + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI); + if (VT.isVector()) + DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, @@ -5594,13 +5793,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } } + // fold (zext (load x)) to multiple smaller zextloads. + // Only on illegal but splittable vectors. + if (SDValue ExtLoad = CombineExtLoad(N)) + return ExtLoad; + // fold (zext (and/or/xor (load x), cst)) -> // (and/or/xor (zextload x), (zext cst)) if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && - TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) && + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) { @@ -5637,7 +5841,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, @@ -5799,7 +6003,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && - TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) { + TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) @@ -5829,7 +6033,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { LoadSDNode *LN0 = cast(N0); ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); - if (!LegalOperations || TLI.isLoadExtLegal(ExtType, MemVT)) { + if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -5958,7 +6162,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { ExtVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() - N01->getZExtValue()); } - if (LegalOperations && !TLI.isLoadExtLegal(ExtType, ExtVT)) + if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT)) return SDValue(); unsigned EVTBits = ExtVT.getSizeInBits(); @@ -6165,7 +6369,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { ISD::isUNINDEXEDLoad(N0.getNode()) && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -6181,7 +6385,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { N0.hasOneUse() && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -6484,19 +6688,15 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // If the input is a constant, let getNode fold it. if (isa(N0) || isa(N0)) { - SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0); - if (Res.getNode() != N) { - if (!LegalOperations || - TLI.isOperationLegal(Res.getNode()->getOpcode(), VT)) - return Res; - - // Folding it resulted in an illegal node, and it's too late to - // do that. Clean up the old node and forego the transformation. - // Ideally this won't happen very often, because instcombine - // and the earlier dagcombine runs (where illegal nodes are - // permitted) should have folded most of them already. - deleteAndRecombine(Res.getNode()); - } + // If we can't allow illegal operations, we need to check that this is just + // a fp -> int or int -> conversion and that the resulting operation will + // be legal. + if (!LegalOperations || + (isa(N0) && VT.isFloatingPoint() && !VT.isVector() && + TLI.isOperationLegal(ISD::ConstantFP, VT)) || + (isa(N0) && VT.isInteger() && !VT.isVector() && + TLI.isOperationLegal(ISD::Constant, VT))) + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0); } // (conv (conv x, t1), t2) -> (conv x, t2) @@ -6713,8 +6913,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) { if (BV->getOperand(i).getOpcode() == ISD::UNDEF) { - for (unsigned j = 0; j != NumOutputsPerInput; ++j) - Ops.push_back(DAG.getUNDEF(DstEltVT)); + Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT)); continue; } @@ -6739,6 +6938,133 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } +// Attempt different variants of (fadd (fmul a, b), c) -> fma or fmad +static SDValue performFaddFmulCombines(unsigned FusedOpcode, + bool Aggressive, + SDNode *N, + const TargetLowering &TLI, + SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // fold (fadd (fmul x, y), z) -> (fma x, y, z) + if (N0.getOpcode() == ISD::FMUL && + (Aggressive || N0->hasOneUse())) { + return DAG.getNode(FusedOpcode, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + + // fold (fadd x, (fmul y, z)) -> (fma y, z, x) + // Note: Commutes FADD operands. + if (N1.getOpcode() == ISD::FMUL && + (Aggressive || N1->hasOneUse())) { + return DAG.getNode(FusedOpcode, SDLoc(N), VT, + N1.getOperand(0), N1.getOperand(1), N0); + } + + // More folding opportunities when target permits. + if (Aggressive) { + // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) + if (N0.getOpcode() == ISD::FMA && + N0.getOperand(2).getOpcode() == ISD::FMUL) { + return DAG.getNode(FusedOpcode, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(FusedOpcode, SDLoc(N), VT, + N0.getOperand(2).getOperand(0), + N0.getOperand(2).getOperand(1), + N1)); + } + + // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) + if (N1->getOpcode() == ISD::FMA && + N1.getOperand(2).getOpcode() == ISD::FMUL) { + return DAG.getNode(FusedOpcode, SDLoc(N), VT, + N1.getOperand(0), N1.getOperand(1), + DAG.getNode(FusedOpcode, SDLoc(N), VT, + N1.getOperand(2).getOperand(0), + N1.getOperand(2).getOperand(1), + N0)); + } + } + + return SDValue(); +} + +static SDValue performFsubFmulCombines(unsigned FusedOpcode, + bool Aggressive, + SDNode *N, + const TargetLowering &TLI, + SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + SDLoc SL(N); + + // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) + if (N0.getOpcode() == ISD::FMUL && + (Aggressive || N0->hasOneUse())) { + return DAG.getNode(FusedOpcode, SL, VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(ISD::FNEG, SL, VT, N1)); + } + + // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) + // Note: Commutes FSUB operands. + if (N1.getOpcode() == ISD::FMUL && + (Aggressive || N1->hasOneUse())) + return DAG.getNode(FusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + N1.getOperand(0)), + N1.getOperand(1), N0); + + // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) + if (N0.getOpcode() == ISD::FNEG && + N0.getOperand(0).getOpcode() == ISD::FMUL && + (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { + SDValue N00 = N0.getOperand(0).getOperand(0); + SDValue N01 = N0.getOperand(0).getOperand(1); + return DAG.getNode(FusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N00), N01, + DAG.getNode(ISD::FNEG, SL, VT, N1)); + } + + // More folding opportunities when target permits. + if (Aggressive) { + // fold (fsub (fma x, y, (fmul u, v)), z) + // -> (fma x, y (fma u, v, (fneg z))) + if (N0.getOpcode() == FusedOpcode && + N0.getOperand(2).getOpcode() == ISD::FMUL) { + return DAG.getNode(FusedOpcode, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(FusedOpcode, SDLoc(N), VT, + N0.getOperand(2).getOperand(0), + N0.getOperand(2).getOperand(1), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N1))); + } + + // fold (fsub x, (fma y, z, (fmul u, v))) + // -> (fma (fneg y), z, (fma (fneg u), v, x)) + if (N1.getOpcode() == FusedOpcode && + N1.getOperand(2).getOpcode() == ISD::FMUL) { + SDValue N20 = N1.getOperand(2).getOperand(0); + SDValue N21 = N1.getOperand(2).getOperand(1); + return DAG.getNode(FusedOpcode, SDLoc(N), VT, + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N1.getOperand(0)), + N1.getOperand(1), + DAG.getNode(FusedOpcode, SDLoc(N), VT, + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N20), + N21, N0)); + } + } + + return SDValue(); +} + SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -6878,24 +7204,55 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } } // enable-unsafe-fp-math + if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) { + // Assume if there is an fmad instruction that it should be aggressively + // used. + if (SDValue Fused = performFaddFmulCombines(ISD::FMAD, true, N, TLI, DAG)) + return Fused; + } // FADD -> FMA combines: if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { - // fold (fadd (fmul x, y), z) -> (fma x, y, z) - if (N0.getOpcode() == ISD::FMUL && - (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), N1); + if (!TLI.isOperationLegal(ISD::FMAD, VT)) { + // Don't form FMA if we are preferring FMAD. + if (SDValue Fused + = performFaddFmulCombines(ISD::FMA, + TLI.enableAggressiveFMAFusion(VT), + N, TLI, DAG)) { + return Fused; + } + } + + // When FP_EXTEND nodes are free on the target, and there is an opportunity + // to combine into FMA, arrange such nodes accordingly. + if (TLI.isFPExtFree(VT)) { - // fold (fadd x, (fmul y, z)) -> (fma y, z, x) - // Note: Commutes FADD operands. - if (N1.getOpcode() == ISD::FMUL && - (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - N1.getOperand(0), N1.getOperand(1), N0); + // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(1)), N1); + } + + // fold (fadd x, (fpext (fmul y, z)), z) -> (fma (fpext y), (fpext z), x) + // Note: Commutes FADD operands. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N10.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N10.getOperand(1)), N0); + } + } } return SDValue(); @@ -6957,37 +7314,95 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } } + if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) { + // Assume if there is an fmad instruction that it should be aggressively + // used. + if (SDValue Fused = performFsubFmulCombines(ISD::FMAD, true, N, TLI, DAG)) + return Fused; + } + // FSUB -> FMA combines: if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { - // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) - if (N0.getOpcode() == ISD::FMUL && - (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) - return DAG.getNode(ISD::FMA, dl, VT, - N0.getOperand(0), N0.getOperand(1), - DAG.getNode(ISD::FNEG, dl, VT, N1)); - - // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) - // Note: Commutes FSUB operands. - if (N1.getOpcode() == ISD::FMUL && - (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) - return DAG.getNode(ISD::FMA, dl, VT, - DAG.getNode(ISD::FNEG, dl, VT, - N1.getOperand(0)), - N1.getOperand(1), N0); - - // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) - if (N0.getOpcode() == ISD::FNEG && - N0.getOperand(0).getOpcode() == ISD::FMUL && - ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) || - TLI.enableAggressiveFMAFusion(VT))) { - SDValue N00 = N0.getOperand(0).getOperand(0); - SDValue N01 = N0.getOperand(0).getOperand(1); - return DAG.getNode(ISD::FMA, dl, VT, - DAG.getNode(ISD::FNEG, dl, VT, N00), N01, - DAG.getNode(ISD::FNEG, dl, VT, N1)); + if (!TLI.isOperationLegal(ISD::FMAD, VT)) { + // Don't form FMA if we are preferring FMAD. + + if (SDValue Fused + = performFsubFmulCombines(ISD::FMA, + TLI.enableAggressiveFMAFusion(VT), + N, TLI, DAG)) { + return Fused; + } + } + + // When FP_EXTEND nodes are free on the target, and there is an opportunity + // to combine into FMA, arrange such nodes accordingly. + if (TLI.isFPExtFree(VT)) { + // fold (fsub (fpext (fmul x, y)), z) + // -> (fma (fpext x), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(1)), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1)); + } + + // fold (fsub x, (fpext (fmul y, z))) + // -> (fma (fneg (fpext y)), (fpext z), x) + // Note: Commutes FSUB operands. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), + VT, N10.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N10.getOperand(1)), + N0); + } + + // fold (fsub (fpext (fneg (fmul, x, y))), z) + // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FNEG) { + SDValue N000 = N00.getOperand(0); + if (N000.getOpcode() == ISD::FMUL) { + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), + VT, N000.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N000.getOperand(1)), + DAG.getNode(ISD::FNEG, dl, VT, N1)); + } + } + } + + // fold (fsub (fneg (fpext (fmul, x, y))), z) + // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FNEG) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FP_EXTEND) { + SDValue N000 = N00.getOperand(0); + if (N000.getOpcode() == ISD::FMUL) { + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), + VT, N000.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N000.getOperand(1)), + DAG.getNode(ISD::FNEG, dl, VT, N1)); + } + } + } } } @@ -7325,7 +7740,8 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { } SDValue DAGCombiner::visitFSQRT(SDNode *N) { - if (DAG.getTarget().Options.UnsafeFPMath) { + if (DAG.getTarget().Options.UnsafeFPMath && + !TLI.isFsqrtCheap()) { // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5) if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) { EVT VT = RV.getValueType(); @@ -7492,6 +7908,50 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { return SDValue(); } +// Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x +static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP) + return SDValue(); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP; + bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT; + + // We can safely assume the conversion won't overflow the output range, + // because (for example) (uint8_t)18293.f is undefined behavior. + + // Since we can assume the conversion won't overflow, our decision as to + // whether the input will fit in the float should depend on the minimum + // of the input range and output range. + + // This means this is also safe for a signed input and unsigned output, since + // a negative input would lead to undefined behavior. + unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; + unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; + unsigned ActualSize = std::min(InputSize, OutputSize); + const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); + + // We can only fold away the float conversion if the input range can be + // represented exactly in the float range. + if (APFloat::semanticsPrecision(sem) >= ActualSize) { + if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) { + unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + return DAG.getNode(ExtOp, SDLoc(N), VT, Src); + } + if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits()) + return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src); + if (SrcVT == VT) + return Src; + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Src); + } + return SDValue(); +} + SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); @@ -7501,7 +7961,7 @@ SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { if (N0CFP) return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); - return SDValue(); + return FoldIntToFPToInt(N, DAG); } SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { @@ -7513,7 +7973,7 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { if (N0CFP) return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); - return SDValue(); + return FoldIntToFPToInt(N, DAG); } SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { @@ -7532,11 +7992,16 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { // fold (fp_round (fp_round x)) -> (fp_round x) if (N0.getOpcode() == ISD::FP_ROUND) { - // This is a value preserving truncation if both round's are. - bool IsTrunc = N->getConstantOperandVal(1) == 1 && - N0.getNode()->getConstantOperandVal(1) == 1; - return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0), - DAG.getIntPtrConstant(IsTrunc)); + const bool NIsTrunc = N->getConstantOperandVal(1) == 1; + const bool N0IsTrunc = N0.getNode()->getConstantOperandVal(1) == 1; + // If the first fp_round isn't a value preserving truncation, it might + // introduce a tie in the second fp_round, that wouldn't occur in the + // single-step fp_round we want to fold to. + // In other words, double rounding isn't the same as rounding. + // Also, this is a value preserving truncation iff both fp_round's are. + if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) + return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0), + DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc)); } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) @@ -7594,7 +8059,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) { + TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -9126,7 +9591,7 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { if (NotMaskLZ == 64) return Result; // All zero mask. // See if we have a continuous run of bits. If so, we have 0*1+0* - if (CountTrailingOnes_64(NotMask >> NotMaskTZ)+NotMaskTZ+NotMaskLZ != 64) + if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64) return Result; // Adjust NotMaskLZ down to be from the actual size of the int instead of i64. @@ -9273,9 +9738,12 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1; unsigned NewBW = NextPowerOf2(MSB - ShAmt); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); + // The narrowing should be profitable, the load/store operation should be + // legal (or custom) and the store size should be equal to the NewVT width. while (NewBW < BitWidth && - !(TLI.isOperationLegalOrCustom(Opc, NewVT) && - TLI.isNarrowingProfitable(VT, NewVT))) { + (NewVT.getStoreSizeInBits() != NewBW || + !TLI.isOperationLegalOrCustom(Opc, NewVT) || + !TLI.isNarrowingProfitable(VT, NewVT))) { NewBW = NextPowerOf2(NewBW); NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); } @@ -9475,36 +9943,136 @@ struct BaseIndexOffset { } }; -/// Holds a pointer to an LSBaseSDNode as well as information on where it -/// is located in a sequence of memory operations connected by a chain. -struct MemOpLink { - MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): - MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } - // Ptr to the mem node. - LSBaseSDNode *MemNode; - // Offset from the base ptr. - int64_t OffsetFromBase; - // What is the sequence number of this mem node. - // Lowest mem operand in the DAG starts at zero. - unsigned SequenceNum; -}; +bool DAGCombiner::MergeStoresOfConstantsOrVecElts( + SmallVectorImpl &StoreNodes, EVT MemVT, + unsigned NumElem, bool IsConstantSrc, bool UseVector) { + // Make sure we have something to merge. + if (NumElem < 2) + return false; + + int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned EarliestNodeUsed = 0; + + for (unsigned i=0; i < NumElem; ++i) { + // Find a chain for the new wide-store operand. Notice that some + // of the store nodes that we found may not be selected for inclusion + // in the wide store. The chain we use needs to be the chain of the + // earliest store node which is *used* and replaced by the wide store. + if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum) + EarliestNodeUsed = i; + } + + // The earliest Node in the DAG. + LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode; + SDLoc DL(StoreNodes[0].MemNode); + + SDValue StoredVal; + if (UseVector) { + // Find a legal type for the vector store. + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); + assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); + if (IsConstantSrc) { + // A vector store with a constant source implies that the constant is + // zero; we only handle merging stores of constant zeros because the zero + // can be materialized without a load. + // It may be beneficial to loosen this restriction to allow non-zero + // store merging. + StoredVal = DAG.getConstant(0, Ty); + } else { + SmallVector Ops; + for (unsigned i = 0; i < NumElem ; ++i) { + StoreSDNode *St = cast(StoreNodes[i].MemNode); + SDValue Val = St->getValue(); + // All of the operands of a BUILD_VECTOR must have the same type. + if (Val.getValueType() != MemVT) + return false; + Ops.push_back(Val); + } + + // Build the extracted vector elements back into a vector. + StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops); + } + } else { + // We should always use a vector store when merging extracted vector + // elements, so this path implies a store of constants. + assert(IsConstantSrc && "Merged vector elements should use vector store"); + + unsigned StoreBW = NumElem * ElementSizeBytes * 8; + APInt StoreInt(StoreBW, 0); + + // Construct a single integer constant which is made of the smaller + // constant inputs. + bool IsLE = TLI.isLittleEndian(); + for (unsigned i = 0; i < NumElem ; ++i) { + unsigned Idx = IsLE ? (NumElem - 1 - i) : i; + StoreSDNode *St = cast(StoreNodes[Idx].MemNode); + SDValue Val = St->getValue(); + StoreInt <<= ElementSizeBytes*8; + if (ConstantSDNode *C = dyn_cast(Val)) { + StoreInt |= C->getAPIntValue().zext(StoreBW); + } else if (ConstantFPSDNode *C = dyn_cast(Val)) { + StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW); + } else { + llvm_unreachable("Invalid constant element type"); + } + } + + // Create the new Load and Store operations. + EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); + StoredVal = DAG.getConstant(StoreInt, StoreTy); + } + + SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal, + FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), + false, false, + FirstInChain->getAlignment()); + + // Replace the first store with the new store + CombineTo(EarliestOp, NewStore); + // Erase all other stores. + for (unsigned i = 0; i < NumElem ; ++i) { + if (StoreNodes[i].MemNode == EarliestOp) + continue; + StoreSDNode *St = cast(StoreNodes[i].MemNode); + // ReplaceAllUsesWith will replace all uses that existed when it was + // called, but graph optimizations may cause new ones to appear. For + // example, the case in pr14333 looks like + // + // St's chain -> St -> another store -> X + // + // And the only difference from St to the other store is the chain. + // When we change it's chain to be St's chain they become identical, + // get CSEed and the net result is that X is now a use of St. + // Since we know that St is redundant, just iterate. + while (!St->use_empty()) + DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); + deleteAndRecombine(St); + } + + return true; +} bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getSizeInBits()/8; - bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); + bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat); // Don't merge vectors into wider inputs. if (MemVT.isVector() || !MemVT.isSimple()) return false; // Perform an early exit check. Do not bother looking at stored values that - // are not constants or loads. + // are not constants, loads, or extracted vector elements. SDValue StoredVal = St->getValue(); bool IsLoadSrc = isa(StoredVal); - if (!isa(StoredVal) && !isa(StoredVal) && - !IsLoadSrc) + bool IsConstantSrc = isa(StoredVal) || + isa(StoredVal); + bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT); + + if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc) return false; // Only look at ends of store sequences. @@ -9646,7 +10214,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; // Store the constants into memory as one consecutive store. - if (!IsLoadSrc) { + if (IsConstantSrc) { unsigned LastLegalType = 0; unsigned LastLegalVectorType = 0; bool NonZero = false; @@ -9695,85 +10263,33 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; - // Make sure we have something to merge. - if (NumElem < 2) - return false; - - unsigned EarliestNodeUsed = 0; - for (unsigned i=0; i < NumElem; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // earliest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum) - EarliestNodeUsed = i; - } - - // The earliest Node in the DAG. - LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode; - SDLoc DL(StoreNodes[0].MemNode); + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + true, UseVector); + } - SDValue StoredVal; - if (UseVector) { + // When extracting multiple vector elements, try to store them + // in one vector store rather than a sequence of scalar stores. + if (IsExtractVecEltSrc) { + unsigned NumElem = 0; + for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { + StoreSDNode *St = cast(StoreNodes[i].MemNode); + SDValue StoredVal = St->getValue(); + // This restriction could be loosened. + // Bail out if any stored values are not elements extracted from a vector. + // It should be possible to handle mixed sources, but load sources need + // more careful handling (see the block of code below that handles + // consecutive loads). + if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); - assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); - StoredVal = DAG.getConstant(0, Ty); - } else { - unsigned StoreBW = NumElem * ElementSizeBytes * 8; - APInt StoreInt(StoreBW, 0); - - // Construct a single integer constant which is made of the smaller - // constant inputs. - bool IsLE = TLI.isLittleEndian(); - for (unsigned i = 0; i < NumElem ; ++i) { - unsigned Idx = IsLE ?(NumElem - 1 - i) : i; - StoreSDNode *St = cast(StoreNodes[Idx].MemNode); - SDValue Val = St->getValue(); - StoreInt<<=ElementSizeBytes*8; - if (ConstantSDNode *C = dyn_cast(Val)) { - StoreInt|=C->getAPIntValue().zext(StoreBW); - } else if (ConstantFPSDNode *C = dyn_cast(Val)) { - StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW); - } else { - llvm_unreachable("Invalid constant element type"); - } - } - - // Create the new Load and Store operations. - EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); - StoredVal = DAG.getConstant(StoreInt, StoreTy); - } - - SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal, - FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), - false, false, - FirstInChain->getAlignment()); - - // Replace the first store with the new store - CombineTo(EarliestOp, NewStore); - // Erase all other stores. - for (unsigned i = 0; i < NumElem ; ++i) { - if (StoreNodes[i].MemNode == EarliestOp) - continue; - StoreSDNode *St = cast(StoreNodes[i].MemNode); - // ReplaceAllUsesWith will replace all uses that existed when it was - // called, but graph optimizations may cause new ones to appear. For - // example, the case in pr14333 looks like - // - // St's chain -> St -> another store -> X - // - // And the only difference from St to the other store is the chain. - // When we change it's chain to be St's chain they become identical, - // get CSEed and the net result is that X is now a use of St. - // Since we know that St is redundant, just iterate. - while (!St->use_empty()) - DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); - deleteAndRecombine(St); + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); + if (TLI.isTypeLegal(Ty)) + NumElem = i + 1; } - return true; + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + false, true); } // Below we handle the case of multiple consecutive stores that @@ -9871,9 +10387,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::ZEXTLOAD, StoreTy) && - TLI.isLoadExtLegal(ISD::SEXTLOAD, StoreTy) && - TLI.isLoadExtLegal(ISD::EXTLOAD, StoreTy)) + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && + TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && + TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy)) LastLegalIntegerType = i+1; } } @@ -10311,7 +10827,8 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( if (ResultVT.bitsGT(VecEltVT)) { // If the result type of vextract is wider than the load, then issue an // extending load instead. - ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, VecEltVT) + ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, + VecEltVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD; Load = DAG.getExtLoad( @@ -10677,6 +11194,11 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) return SDValue(); + // Just because the floating-point vector type is legal does not necessarily + // mean that the corresponding integer vector type is. + if (!isTypeLegal(NVT)) + return SDValue(); + SmallVector Opnds; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); @@ -10908,36 +11430,51 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { } } + // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR. + // We have already tested above for an UNDEF only concatenation. // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) // -> (BUILD_VECTOR A, B, ..., C, D, ...) - if (N->getNumOperands() == 2 && - N->getOperand(0).getOpcode() == ISD::BUILD_VECTOR && - N->getOperand(1).getOpcode() == ISD::BUILD_VECTOR) { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); + auto IsBuildVectorOrUndef = [](const SDValue &Op) { + return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode(); + }; + bool AllBuildVectorsOrUndefs = + std::all_of(N->op_begin(), N->op_end(), IsBuildVectorOrUndef); + if (AllBuildVectorsOrUndefs) { SmallVector Opnds; - unsigned BuildVecNumElts = N0.getNumOperands(); - - EVT SclTy0 = N0.getOperand(0)->getValueType(0); - EVT SclTy1 = N1.getOperand(0)->getValueType(0); - if (SclTy0.isFloatingPoint()) { - for (unsigned i = 0; i != BuildVecNumElts; ++i) - Opnds.push_back(N0.getOperand(i)); - for (unsigned i = 0; i != BuildVecNumElts; ++i) - Opnds.push_back(N1.getOperand(i)); - } else { + EVT SVT = VT.getScalarType(); + + EVT MinVT = SVT; + if (!SVT.isFloatingPoint()) // If BUILD_VECTOR are from built from integer, they may have different // operand types. Get the smaller type and truncate all operands to it. - EVT MinTy = SclTy0.bitsLE(SclTy1) ? SclTy0 : SclTy1; - for (unsigned i = 0; i != BuildVecNumElts; ++i) - Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy, - N0.getOperand(i))); - for (unsigned i = 0; i != BuildVecNumElts; ++i) - Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy, - N1.getOperand(i))); + for (const SDValue &Op : N->ops()) { + EVT OpSVT = Op.getValueType().getScalarType(); + MinVT = MinVT.bitsLE(OpSVT) ? MinVT : OpSVT; + } + + for (const SDValue &Op : N->ops()) { + EVT OpVT = Op.getValueType(); + unsigned NumElts = OpVT.getVectorNumElements(); + + if (ISD::UNDEF == Op.getOpcode()) + for (unsigned i = 0; i != NumElts; ++i) + Opnds.push_back(DAG.getUNDEF(MinVT)); + + if (ISD::BUILD_VECTOR == Op.getOpcode()) { + if (SVT.isFloatingPoint()) { + assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch"); + for (unsigned i = 0; i != NumElts; ++i) + Opnds.push_back(Op.getOperand(i)); + } else { + for (unsigned i = 0; i != NumElts; ++i) + Opnds.push_back( + DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i))); + } + } } + assert(VT.getVectorNumElements() == Opnds.size() && + "Concat vector type mismatch"); return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds); } @@ -11133,7 +11670,8 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0, return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask()); } -// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat. +// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, +// or turn a shuffle of a single concat into simpler shuffle then concat. static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -11147,6 +11685,18 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); unsigned NumConcats = NumElts / NumElemsPerConcat; + // Special case: shuffle(concat(A,B)) can be more efficiently represented + // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high + // half vector elements. + if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF && + std::all_of(SVN->getMask().begin() + NumElemsPerConcat, + SVN->getMask().end(), [](int i) { return i == -1; })) { + N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), + ArrayRef(SVN->getMask().begin(), NumElemsPerConcat)); + N1 = DAG.getUNDEF(ConcatVT); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); + } + // Look at every vector that's inserted. We're looking for exact // subvector-sized copies from a concatenated vector for (unsigned I = 0; I != NumConcats; ++I) { @@ -11245,7 +11795,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } // If it is a splat, check if the argument vector is another splat or a - // build_vector with all scalar elements the same. + // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { SDNode *V = N0.getNode(); @@ -11282,6 +11832,18 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { // Splat of , return if (AllSame) return N0; + + // Canonicalize any other splat as a build_vector. + const SDValue &Splatted = V->getOperand(SVN->getSplatIndex()); + SmallVector Ops(NumElts, Splatted); + SDValue NewBV = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), + V->getValueType(0), Ops); + + // We may have jumped through bitcasts, so the type of the + // BUILD_VECTOR may not match the type of the shuffle. + if (V->getValueType(0) != VT) + NewBV = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, NewBV); + return NewBV; } } @@ -11329,8 +11891,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) // Don't try to fold shuffles with illegal type. - if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && - TLI.isTypeLegal(VT)) { + // Only fold if this shuffle is the only user of the other shuffle. + if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) && + Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { ShuffleVectorSDNode *OtherSV = cast(N0); // The incoming shuffle must be of the same type as the result of the @@ -11494,9 +12057,11 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { return SDValue(); } - // Let's see if the target supports this vector_shuffle. + // Let's see if the target supports this vector_shuffle and make sure + // we're not running after operation legalization where it may have + // custom lowered the vector shuffles. EVT RVT = RHS.getValueType(); - if (!TLI.isVectorClearMaskLegal(Indices, RVT)) + if (LegalOperations || !TLI.isVectorClearMaskLegal(Indices, RVT)) return SDValue(); // Return the new VECTOR_SHUFFLE node.