From 7a5e55509b99d579d56d126a7b503ec6fe153a8f Mon Sep 17 00:00:00 2001
From: Eli Friedman
Date: Sun, 7 Jun 2009 06:52:44 +0000
Subject: [PATCH] Slightly generalize the code that handles shuffles of
 consecutive loads on x86 to handle more cases.

Fix a bug in said code that would cause it to read past the end of an
object.

Rewrite the code in SelectionDAGLegalize::ExpandBUILD_VECTOR to be a bit
more general.  Remove PerformBuildVectorCombine, which is no longer
necessary with these changes.  In addition to simplifying the code, with
this change, we can now catch a few more cases of consecutive loads.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73012 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 115 ++++++------
 lib/Target/X86/X86ISelLowering.cpp       | 143 +++++++-----
 test/CodeGen/X86/vec_loadsingles.ll      |  12 ++
 test/CodeGen/X86/vec_set-5.ll            |   3 +-
 test/CodeGen/X86/vec_set-6.ll            |   2 +-
 5 files changed, 90 insertions(+), 185 deletions(-)
 create mode 100644 test/CodeGen/X86/vec_loadsingles.ll

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5ae183e2fa0..f3c2833e0fe 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1785,48 +1785,41 @@ SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
 /// support the operation, but do support the resultant vector type.
 SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
   unsigned NumElems = Node->getNumOperands();
-  SDValue SplatValue = Node->getOperand(0);
+  SDValue Value1, Value2;
   DebugLoc dl = Node->getDebugLoc();
   MVT VT = Node->getValueType(0);
-  MVT OpVT = SplatValue.getValueType();
+  MVT OpVT = Node->getOperand(0).getValueType();
   MVT EltVT = VT.getVectorElementType();
 
   // If the only non-undef value is the low element, turn this into a
   // SCALAR_TO_VECTOR node.  If this is { X, X, X, X }, determine X.
   bool isOnlyLowElement = true;
-
-  // FIXME: it would be far nicer to change this into map<SDValue,uint64_t>
-  // and use a bitmask instead of a list of elements.
-  // FIXME: this doesn't treat <0, u, 0, u> for example, as a splat.
-  std::map<SDValue, std::vector<unsigned> > Values;
-  Values[SplatValue].push_back(0);
+  bool MoreThanTwoValues = false;
   bool isConstant = true;
-  if (!isa<ConstantFPSDNode>(SplatValue) && !isa<ConstantSDNode>(SplatValue) &&
-      SplatValue.getOpcode() != ISD::UNDEF)
-    isConstant = false;
-
-  for (unsigned i = 1; i < NumElems; ++i) {
+  for (unsigned i = 0; i < NumElems; ++i) {
     SDValue V = Node->getOperand(i);
-    Values[V].push_back(i);
-    if (V.getOpcode() != ISD::UNDEF)
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
       isOnlyLowElement = false;
-    if (SplatValue != V)
-      SplatValue = SDValue(0, 0);
-
-    // If this isn't a constant element or an undef, we can't use a constant
-    // pool load.
-    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V) &&
-        V.getOpcode() != ISD::UNDEF)
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
       isConstant = false;
+
+    if (!Value1.getNode()) {
+      Value1 = V;
+    } else if (!Value2.getNode()) {
+      if (V != Value1)
+        Value2 = V;
+    } else if (V != Value1 && V != Value2) {
+      MoreThanTwoValues = true;
+    }
   }
 
-  if (isOnlyLowElement) {
-    // If the low element is an undef too, then this whole things is an undef.
-    if (Node->getOperand(0).getOpcode() == ISD::UNDEF)
-      return DAG.getUNDEF(VT);
-    // Otherwise, turn this into a scalar_to_vector node.
+  if (!Value1.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
-  }
 
   // If all elements are constants, create a load from the constant pool.
   if (isConstant) {
@@ -1852,59 +1845,25 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
                        false, Alignment);
   }
 
-  if (SplatValue.getNode()) {   // Splat of one value?
-    // Build the shuffle constant vector: <0, 0, 0, 0>
-    SmallVector<int, 8> ZeroVec(NumElems, 0);
-
-    // If the target supports VECTOR_SHUFFLE and this shuffle mask, use it.
-    if (TLI.isShuffleMaskLegal(ZeroVec, Node->getValueType(0))) {
+  if (!MoreThanTwoValues) {
+    SmallVector<int, 8> ShuffleVec(NumElems, -1);
+    for (unsigned i = 0; i < NumElems; ++i) {
+      SDValue V = Node->getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      ShuffleVec[i] = V == Value1 ? 0 : NumElems;
+    }
+    if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
       // Get the splatted value into the low element of a vector register.
-      SDValue LowValVec =
-        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, SplatValue);
+      SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
+      SDValue Vec2;
+      if (Value2.getNode())
+        Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
+      else
+        Vec2 = DAG.getUNDEF(VT);
 
       // Return shuffle(LowValVec, undef, <0,0,0,0>)
-      return DAG.getVectorShuffle(VT, dl, LowValVec, DAG.getUNDEF(VT),
-                                  &ZeroVec[0]);
-    }
-  }
-
-  // If there are only two unique elements, we may be able to turn this into a
-  // vector shuffle.
-  if (Values.size() == 2) {
-    // Get the two values in deterministic order.
-    SDValue Val1 = Node->getOperand(1);
-    SDValue Val2;
-    std::map<SDValue, std::vector<unsigned> >::iterator MI = Values.begin();
-    if (MI->first != Val1)
-      Val2 = MI->first;
-    else
-      Val2 = (++MI)->first;
-
-    // If Val1 is an undef, make sure it ends up as Val2, to ensure that our
-    // vector shuffle has the undef vector on the RHS.
-    if (Val1.getOpcode() == ISD::UNDEF)
-      std::swap(Val1, Val2);
-
-    // Build the shuffle constant vector: e.g. <0, 4, 0, 4>
-    SmallVector<int, 8> ShuffleMask(NumElems, -1);
-
-    // Set elements of the shuffle mask for Val1.
-    std::vector<unsigned> &Val1Elts = Values[Val1];
-    for (unsigned i = 0, e = Val1Elts.size(); i != e; ++i)
-      ShuffleMask[Val1Elts[i]] = 0;
-
-    // Set elements of the shuffle mask for Val2.
-    std::vector<unsigned> &Val2Elts = Values[Val2];
-    for (unsigned i = 0, e = Val2Elts.size(); i != e; ++i)
-      if (Val2.getOpcode() != ISD::UNDEF)
-        ShuffleMask[Val2Elts[i]] = NumElems;
-
-    // If the target supports SCALAR_TO_VECTOR and this shuffle mask, use it.
-    if (TLI.isOperationLegalOrCustom(ISD::SCALAR_TO_VECTOR, VT) &&
-        TLI.isShuffleMaskLegal(ShuffleMask, VT)) {
-      Val1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val1);
-      Val2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val2);
-      return DAG.getVectorShuffle(VT, dl, Val1, Val2, &ShuffleMask[0]);
+      return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec.data());
     }
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ef60ff5e092..7d18b968f78 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7691,13 +7691,15 @@ static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
 }
 
 static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
-                                     MVT EVT, SDNode *&Base,
+                                     MVT EVT, LoadSDNode *&LDBase,
+                                     unsigned &LastLoadedElt,
                                      SelectionDAG &DAG, MachineFrameInfo *MFI,
                                      const TargetLowering &TLI) {
-  Base = NULL;
+  LDBase = NULL;
+  LastLoadedElt = -1;
   for (unsigned i = 0; i < NumElems; ++i) {
     if (N->getMaskElt(i) < 0) {
-      if (!Base)
+      if (!LDBase)
         return false;
       continue;
     }
@@ -7706,19 +7708,20 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return false;
-    if (!Base) {
-      Base = Elt.getNode();
-      if (Base->getOpcode() == ISD::UNDEF)
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
         return false;
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
       continue;
     }
     if (Elt.getOpcode() == ISD::UNDEF)
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    LoadSDNode *LDBase = cast<LoadSDNode>(Base);
     if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
       return false;
+    LastLoadedElt = i;
   }
   return true;
 }
@@ -7737,6 +7740,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
   unsigned NumElems = VT.getVectorNumElements();
 
+  if (VT.getSizeInBits() != 128)
+    return SDValue();
+
   // For x86-32 machines, if we see an insert and then a shuffle in a v2i64
   // where the upper half is 0, it is advantageous to rewrite it as a build
   // vector of (0, val) so it can use movq.
@@ -7764,107 +7770,24 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
 
   // Try to combine a vector_shuffle into a 128-bit load.
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  SDNode *Base = NULL;
-  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, Base, DAG, MFI, TLI))
+  LoadSDNode *LD = NULL;
+  unsigned LastLoadedElt;
+  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
+                                MFI, TLI))
     return SDValue();
 
-  LoadSDNode *LD = cast<LoadSDNode>(Base);
-  if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
+  if (LastLoadedElt == NumElems - 1) {
+    if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
+      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+                         LD->getSrcValue(), LD->getSrcValueOffset(),
+                         LD->isVolatile());
     return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                        LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile());
-  return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                     LD->getSrcValue(), LD->getSrcValueOffset(),
-                     LD->isVolatile(), LD->getAlignment());
-}
-
-/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
-static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
-                                         TargetLowering::DAGCombinerInfo &DCI,
-                                         const X86Subtarget *Subtarget,
-                                         const TargetLowering &TLI) {
-  unsigned NumOps = N->getNumOperands();
-  DebugLoc dl = N->getDebugLoc();
-
-  // Ignore single operand BUILD_VECTOR.
-  if (NumOps == 1)
-    return SDValue();
-
-  MVT VT = N->getValueType(0);
-  MVT EVT = VT.getVectorElementType();
-
-  // Before or during type legalization, we want to try and convert a
-  // build_vector of an i64 load and a zero value into vzext_movl before the
-  // legalizer can break it up.
-  // FIXME: does the case below remove the need to do this?
-  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) {
-    if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
-      return SDValue();
-
-    // This must be an insertion into a zero vector.
-    SDValue HighElt = N->getOperand(1);
-    if (!isZeroNode(HighElt))
-      return SDValue();
-
-    // Value must be a load.
-    SDNode *Base = N->getOperand(0).getNode();
-    if (!isa<LoadSDNode>(Base)) {
-      if (Base->getOpcode() != ISD::BIT_CONVERT)
-        return SDValue();
-      Base = Base->getOperand(0).getNode();
-      if (!isa<LoadSDNode>(Base))
-        return SDValue();
-    }
-
-    // Transform it into VZEXT_LOAD addr.
-    LoadSDNode *LD = cast<LoadSDNode>(Base);
-
-    // Load must not be an extload.
-    if (LD->getExtensionType() != ISD::NON_EXTLOAD)
-      return SDValue();
-
-    // Load type should legal type so we don't have to legalize it.
-    if (!TLI.isTypeLegal(VT))
-      return SDValue();
-
-    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
-    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    TargetLowering::TargetLoweringOpt TLO(DAG);
-    TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
-    DCI.CommitTargetLoweringOpt(TLO);
-    return ResNode;
-  }
-
-  // The type legalizer will have broken apart v2i64 build_vector created during
-  // widening before the code which handles that case is run.  Look for build
-  // vector (load, load + 4, 0/undef, 0/undef)
-  if (VT == MVT::v4i32 || VT == MVT::v4f32) {
-    LoadSDNode *LD0 = dyn_cast<LoadSDNode>(N->getOperand(0));
-    LoadSDNode *LD1 = dyn_cast<LoadSDNode>(N->getOperand(1));
-    if (!LD0 || !LD1)
-      return SDValue();
-    if (LD0->getExtensionType() != ISD::NON_EXTLOAD ||
-        LD1->getExtensionType() != ISD::NON_EXTLOAD)
-      return SDValue();
-    // Make sure the second elt is a consecutive load.
-    if (!TLI.isConsecutiveLoad(LD1, LD0, EVT.getSizeInBits()/8, 1,
-                               DAG.getMachineFunction().getFrameInfo()))
-      return SDValue();
-
-    SDValue N2 = N->getOperand(2);
-    SDValue N3 = N->getOperand(3);
-    if (!isZeroNode(N2) && N2.getOpcode() != ISD::UNDEF)
-      return SDValue();
-    if (!isZeroNode(N3) && N3.getOpcode() != ISD::UNDEF)
-      return SDValue();
-
+                       LD->isVolatile(), LD->getAlignment());
+  } else if (NumElems == 4 && LastLoadedElt == 1) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
-    SDValue Ops[] = { LD0->getChain(), LD0->getBasePtr() };
+    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
     SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    TargetLowering::TargetLoweringOpt TLO(DAG);
-    TLO.CombineTo(SDValue(LD0, 1), ResNode.getValue(1));
-    DCI.CommitTargetLoweringOpt(TLO);
     return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
   }
   return SDValue();
@@ -8466,14 +8389,25 @@ static SDValue PerformBTCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() == ISD::BIT_CONVERT)
+    Op = Op.getOperand(0);
+  MVT VT = N->getValueType(0), OpVT = Op.getValueType();
+  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+      VT.getVectorElementType().getSizeInBits() ==
+      OpVT.getVectorElementType().getSizeInBits()) {
+    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+  }
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
-  case ISD::BUILD_VECTOR:
-    return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
@@ -8485,6 +8419,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
+  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   }
 
   return SDValue();
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll
new file mode 100644
index 00000000000..67122763ec9
--- /dev/null
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+
+define <4 x float> @a(<4 x float> %a, float* nocapture %p) nounwind readonly {
+entry:
+  %tmp1 = load float* %p
+  %vecins = insertelement <4 x float> undef, float %tmp1, i32 0
+  %add.ptr = getelementptr float* %p, i32 1
+  %tmp5 = load float* %add.ptr
+  %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
+  ret <4 x float> %vecins7
+}
+
diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll
index 4fc652c022a..d3329701119 100644
--- a/test/CodeGen/X86/vec_set-5.ll
+++ b/test/CodeGen/X86/vec_set-5.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
 ; RUN: grep movlhps %t | count 1
-; RUN: grep movq %t | count 1
-; RUN: grep movsd %t | count 1
+; RUN: grep movq %t | count 2
 
 define <4 x float> @test1(float %a, float %b) nounwind {
 	%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0		; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll
index 02df526cee8..c7b6747a86f 100644
--- a/test/CodeGen/X86/vec_set-6.ll
+++ b/test/CodeGen/X86/vec_set-6.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
 ; RUN: grep movss %t | count 1
-; RUN: grep movups %t | count 1
+; RUN: grep movq %t | count 1
 ; RUN: grep shufps %t | count 1
 
 define <4 x float> @test(float %a, float %b, float %c) nounwind {
-- 
2.34.1
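
Note on the ExpandBUILD_VECTOR rewrite: the new code makes a single pass
that buckets the BUILD_VECTOR operands into at most two distinct defined
values, which is what lets it shuffle-lower vectors like <X, undef, X, Y>
that the old splat-only and two-unique-element paths missed.  The sketch
below is a condensed, self-contained model of that classification loop;
the names Value1, Value2, MoreThanTwoValues, and isOnlyLowElement mirror
the patch, but plain nonzero ints stand in for SDValue operands and 0
stands in for ISD::UNDEF, so none of this is actual LLVM API.

#include <cassert>
#include <vector>

// Model of the classification loop the patch adds to ExpandBUILD_VECTOR.
// Elements are ints here instead of SDValues; 0 plays the role of UNDEF.
// The real code tests V.getOpcode() == ISD::UNDEF and SDValue identity.
struct BuildVectorInfo {
  int Value1 = 0, Value2 = 0;      // up to two distinct defined values
  bool IsOnlyLowElement = true;    // only element 0 is defined
  bool MoreThanTwoValues = false;  // shuffle lowering is impossible
};

BuildVectorInfo classify(const std::vector<int> &Elts) {
  BuildVectorInfo R;
  for (unsigned i = 0; i < Elts.size(); ++i) {
    int V = Elts[i];
    if (V == 0)                    // "undef" elements constrain nothing
      continue;
    if (i > 0)
      R.IsOnlyLowElement = false;
    if (!R.Value1) {
      R.Value1 = V;                // first defined value seen
    } else if (!R.Value2) {
      if (V != R.Value1)
        R.Value2 = V;              // second distinct value
    } else if (V != R.Value1 && V != R.Value2) {
      R.MoreThanTwoValues = true;  // three values: give up on shuffles
    }
  }
  return R;
}

int main() {
  // <X, undef, X, Y> has two values, so it can become
  // shuffle(scalar_to_vector(X), scalar_to_vector(Y), <0, u, 0, 4>).
  BuildVectorInfo R = classify({7, 0, 7, 9});
  assert(R.Value1 == 7 && R.Value2 == 9 && !R.MoreThanTwoValues);
  return 0;
}

With this classification in hand, the patch builds one shuffle mask that
covers the splat case (no Value2) and the two-value case uniformly,
subsuming the two separate code paths it deletes.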
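Note on the x86 combine: once EltsFromConsecutiveLoads succeeds, the
rewritten PerformShuffleCombine picks the replacement load based on
LastLoadedElt.  This is also where the read-past-the-end bug from the log
message is fixed: the old code emitted a full 16-byte load even when only
a prefix of the elements was actually loaded.  Below is a minimal model of
that decision under invented names (the enum and pickCombine are
illustrative only, not LLVM API).

#include <cassert>

// Which instruction-sized load may replace a shuffle of consecutive loads?
enum CombineKind {
  NoCombine,
  Full128BitLoad,  // movaps/movups: loads cover every vector element
  ZextHalfLoad     // movq: only the low two of four elements are loaded
};

CombineKind pickCombine(unsigned NumElems, int LastLoadedElt) {
  if (LastLoadedElt == int(NumElems) - 1)
    return Full128BitLoad;  // safe: loads reach the end of the vector
  if (NumElems == 4 && LastLoadedElt == 1)
    return ZextHalfLoad;    // low 64 bits loaded, upper half undef/zero
  return NoCombine;         // e.g. elements <0,1,2> of 4: a 16-byte load
                            // could read past the end of the object
}

int main() {
  assert(pickCombine(4, 3) == Full128BitLoad);
  assert(pickCombine(4, 1) == ZextHalfLoad);  // the vec_loadsingles.ll case
  assert(pickCombine(4, 2) == NoCombine);     // the old out-of-bounds case
  return 0;
}

The (4, 1) case corresponds to the new vec_loadsingles.ll test, where two
adjacent f32 loads become a single movq, and the updated grep counts in
vec_set-5.ll and vec_set-6.ll reflect the same lowering.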