From 9b4cc76745cff2f823e726981febd83a8b6e05b3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 3 Feb 2015 18:54:00 +0000
Subject: [PATCH] Merge consecutive 16-byte loads into one 32-byte load
 (PR22329)

This patch detects consecutive vector loads using the existing
EltsFromConsecutiveLoads() logic. This fixes:
http://llvm.org/bugs/show_bug.cgi?id=22329

This patch effectively reverts the tablegen additions of D6492 /
http://reviews.llvm.org/rL224344 ...which in hindsight were a horrible hack.

The test cases that were added with that patch are simply modified to load
from varying offsets of a base pointer. These loads did not match the
existing tablegen patterns.

A happy side effect of doing this optimization earlier is that we can now
fold the load into a math op where possible; this is shown in some of the
updated checks in the test file.

Differential Revision: http://reviews.llvm.org/D7303

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@228006 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp           | 62 +++++++++++++-----
 lib/Target/X86/X86InstrSSE.td                | 43 -------------
 test/CodeGen/X86/unaligned-32-byte-memops.ll | 67 +++++++++++---------
 3 files changed, 85 insertions(+), 87 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index df7984578ab..bf216c767d2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6011,9 +6011,9 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
   return SDValue();
 }
 
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
 ///
 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
 ///
@@ -6023,7 +6023,6 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         SDLoc &DL, SelectionDAG &DAG,
                                         bool isAfterLegalize) {
-  EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
   LoadSDNode *LDBase = nullptr;
@@ -6034,7 +6033,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   // non-consecutive, bail out.
   for (unsigned i = 0; i < NumElems; ++i) {
     SDValue Elt = Elts[i];
-
+    // Look through a bitcast.
+    if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+      Elt = Elt.getOperand(0);
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return SDValue();
@@ -6049,7 +6050,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+    EVT LdVT = Elt.getValueType();
+    // Each loaded element must be the correct fractional portion of the
+    // requested vector load.
+    if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+      return SDValue();
+    if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
       return SDValue();
     LastLoadedElt = i;
   }
@@ -6058,6 +6064,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   // load of the entire vector width starting at the base pointer. If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+    assert(LDBase && "Did not find base load for merging consecutive loads");
+    EVT EltVT = LDBase->getValueType(0);
+    // Ensure that the input vector size for the merged loads matches the
+    // cumulative size of the input elements.
+    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+      return SDValue();
 
     if (isAfterLegalize &&
         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
@@ -6084,6 +6096,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   //TODO: The code below fires only for for loading the low v2i32 / v2f32
   //of a v4i32 / v4f32. It's probably worth generalizing.
+  EVT EltVT = VT.getVectorElementType();
   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -13164,25 +13177,44 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
   if (!Subtarget->hasAVX())
     return SDValue();
-
+
   SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
   SDValue SubVec = Op.getOperand(1);
   SDValue Idx = Op.getOperand(2);
+
+  if (!isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
-
+
+  // Fold two 16-byte subvector loads into one 32-byte load:
+  // (insert_subvector (insert_subvector undef, (load addr), 0),
+  //                   (load addr + 16), Elts/2)
+  // --> load32 addr
+  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+      !Subtarget->isUnalignedMem32Slow()) {
+    SDValue SubVec2 = Vec.getOperand(1);
+    if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+      if (Idx2->getZExtValue() == 0) {
+        SDValue Ops[] = { SubVec2, SubVec };
+        SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+        if (LD.getNode())
+          return LD;
+      }
+    }
+  }
+
   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
-      SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      SubVecVT.is128BitVector())
     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-  }
 
-  if (OpVT.is512BitVector() &&
-      SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-  }
   return SDValue();
 }
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 0ab53f3183e..45656631830 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8141,49 +8141,6 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                    (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
-// Combine two consecutive 16-byte loads with a common destination register into
-// one 32-byte load to that register.
-let Predicates = [HasAVX, HasFastMem32] in {
-  def : Pat<(insert_subvector
-              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
-              (loadv4f32 (add addr:$src, (iPTR 16))),
-              (iPTR 4)),
-            (VMOVUPSYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
-              (loadv2f64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVUPDYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v32i8 (insert_subvector
-                undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 16)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v16i16 (insert_subvector
-                undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 8)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v8i32 (insert_subvector
-                undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 4)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
-              (loadv2i64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVDQUYrm addr:$src)>;
-}
-
 let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index 347f330d67a..9cec17d9432 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -65,8 +65,9 @@ define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
@@ -88,8 +89,9 @@ define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
@@ -111,8 +113,9 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %v3
@@ -133,8 +136,9 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x float> %v3
@@ -160,12 +164,13 @@ define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddq
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddq
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1
-  %v1 = load <2 x i64>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>* %ptr1, align 1
   %v2 = load <2 x i64>* %ptr2, align 1
   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = add <4 x i64> %v3, %x
@@ -187,12 +192,13 @@ define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddd
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1
-  %v1 = load <4 x i32>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>* %ptr1, align 1
   %v2 = load <4 x i32>* %ptr2, align 1
   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = add <8 x i32> %v3, %x
@@ -214,12 +220,13 @@ define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddw
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddw
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1
-  %v1 = load <8 x i16>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>* %ptr1, align 1
   %v2 = load <8 x i16>* %ptr2, align 1
   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %v4 = add <16 x i16> %v3, %x
@@ -241,12 +248,13 @@ define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddb
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddb
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1
-  %v1 = load <16 x i8>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>* %ptr1, align 1
   %v2 = load <16 x i8>* %ptr2, align 1
   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %v4 = add <32 x i8> %v3, %x
@@ -261,16 +269,17 @@ define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x doubl
   ; SANDYB-NEXT: vaddpd
   ; SANDYB-NEXT: retq
 
-  ; BTVER2: vmovupd
-  ; BTVER2-NEXT: vaddpd
+  ; BTVER2-NOT: vinsertf128
+  ; BTVER2: vaddpd
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovupd
+  ; HASWELL-NOT: vinsertf128
   ; HASWELL: vaddpd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1
-  %v1 = load <2 x double>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>* %ptr1, align 1
   %v2 = load <2 x double>* %ptr2, align 1
   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = fadd <4 x double> %v3, %x
-- 
2.34.1
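
A rough sketch of the transformation, not taken from the patch itself: IR in the
same style as the test cases above, with the instruction sequences one would
roughly expect before and after this change on a CPU where unaligned 32-byte
accesses are fast (the BTVER2/HASWELL configurations the test targets). The
function name and the exact assembly in the comments are assumptions for
illustration only.

  ; Two adjacent unaligned 16-byte loads concatenated into one 256-bit value.
  define <8 x float> @concat_adjacent_loads(<4 x float>* %ptr) {
    %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
    %v1 = load <4 x float>* %ptr, align 1
    %v2 = load <4 x float>* %ptr2, align 1
    %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    ret <8 x float> %v3
  }

  ; Before this patch (two 16-byte loads plus an insert), roughly:
  ;   vmovups     (%rdi), %xmm0
  ;   vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
  ; After this patch (one unaligned 32-byte load), roughly:
  ;   vmovups     (%rdi), %ymm0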