From 0c0f83ff5d214a7f42e86ae62814526ba40a28cf Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Sat, 5 Apr 2008 00:30:36 +0000 Subject: [PATCH] Favors pshufd over shufps when shuffling elements from one vector. pshufd is faster than shufps. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@49244 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 92 ++++++++++--------- lib/Target/X86/X86InstrSSE.td | 8 +- .../CodeGen/X86/2007-09-18-ShuffleXformBug.ll | 2 +- .../CodeGen/X86/peep-vector-extract-concat.ll | 4 +- test/CodeGen/X86/vec_set-3.ll | 7 +- test/CodeGen/X86/vec_set-6.ll | 2 +- test/CodeGen/X86/vec_shuffle-13.ll | 6 +- test/CodeGen/X86/vec_shuffle-16.ll | 25 +++++ test/CodeGen/X86/vec_shuffle-5.ll | 2 +- test/CodeGen/X86/vec_shuffle.ll | 6 +- test/CodeGen/X86/vec_splat-2.ll | 2 +- test/CodeGen/X86/vec_splat.ll | 6 +- 12 files changed, 95 insertions(+), 67 deletions(-) create mode 100644 test/CodeGen/X86/vec_shuffle-16.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8d4b4645b15..f73c3c24c45 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2782,23 +2782,28 @@ static SDOperand getSwapEltZeroMask(unsigned NumElems, unsigned DestElt, return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); } -/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4i32. -/// -static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) { +/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. +static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG, bool HasSSE2) { + MVT::ValueType PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32; + MVT::ValueType VT = Op.getValueType(); + if (PVT == VT) + return Op; SDOperand V1 = Op.getOperand(0); SDOperand Mask = Op.getOperand(2); - MVT::ValueType VT = Op.getValueType(); unsigned NumElems = Mask.getNumOperands(); - Mask = getUnpacklMask(NumElems, DAG); - while (NumElems != 4) { - V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask); - NumElems >>= 1; + // Special handling of v4f32 -> v4i32. + if (VT != MVT::v4f32) { + Mask = getUnpacklMask(NumElems, DAG); + while (NumElems > 4) { + V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask); + NumElems >>= 1; + } + Mask = getZeroVector(MVT::v4i32, DAG); } - V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1); - Mask = getZeroVector(MVT::v4i32, DAG); - SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1, - DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask); + V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1); + SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1, + DAG.getNode(ISD::UNDEF, PVT), Mask); return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle); } @@ -3426,6 +3431,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { SDOperand PermMask = Op.getOperand(2); MVT::ValueType VT = Op.getValueType(); unsigned NumElems = PermMask.getNumOperands(); + bool isMMX = MVT::getSizeInBits(VT) == 64; bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; @@ -3443,9 +3449,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { return V2; if (isSplatMask(PermMask.Val)) { - if (NumElems <= 4) return Op; - // Promote it to a v4i32 splat. - return PromoteSplat(Op, DAG); + if (isMMX || NumElems < 4) return Op; + // Promote it to a v4{if}32 splat. 
+ return PromoteSplat(Op, DAG, Subtarget->hasSSE2()); } // If the shuffle can be profitably rewritten as a narrower shuffle, then @@ -3556,35 +3562,39 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { return Op; } - // If VT is integer, try PSHUF* first, then SHUFP*. - if (MVT::isInteger(VT)) { - // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically - // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented. - if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) && - X86::isPSHUFDMask(PermMask.Val)) || - X86::isPSHUFHWMask(PermMask.Val) || - X86::isPSHUFLWMask(PermMask.Val)) { - if (V2.getOpcode() != ISD::UNDEF) - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, - DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask); + // Try PSHUF* first, then SHUFP*. + // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically + // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented. + if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.Val)) { + if (V2.getOpcode() != ISD::UNDEF) + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, + DAG.getNode(ISD::UNDEF, VT), PermMask); + return Op; + } + + if (!isMMX) { + if (Subtarget->hasSSE2() && + (X86::isPSHUFDMask(PermMask.Val) || + X86::isPSHUFHWMask(PermMask.Val) || + X86::isPSHUFLWMask(PermMask.Val))) { + MVT::ValueType RVT = VT; + if (VT == MVT::v4f32) { + RVT = MVT::v4i32; + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, + DAG.getNode(ISD::BIT_CONVERT, RVT, V1), + DAG.getNode(ISD::UNDEF, RVT), PermMask); + } else if (V2.getOpcode() != ISD::UNDEF) + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, V1, + DAG.getNode(ISD::UNDEF, RVT), PermMask); + if (RVT != VT) + Op = DAG.getNode(ISD::BIT_CONVERT, VT, Op); return Op; } - if (X86::isSHUFPMask(PermMask.Val) && - MVT::getSizeInBits(VT) != 64) // Don't do this for MMX. + // Binary or unary shufps. + if (X86::isSHUFPMask(PermMask.Val) || + (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.Val))) return Op; - } else { - // Floating point cases in the other order. - if (X86::isSHUFPMask(PermMask.Val)) - return Op; - if (X86::isPSHUFDMask(PermMask.Val) || - X86::isPSHUFHWMask(PermMask.Val) || - X86::isPSHUFLWMask(PermMask.Val)) { - if (V2.getOpcode() != ISD::UNDEF) - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, - DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask); - return Op; - } } // Handle v8i16 specifically since SSE can do byte extraction and insertion. @@ -3595,7 +3605,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { } // Handle all 4 wide cases with a number of shuffles. - if (NumElems == 4 && MVT::getSizeInBits(VT) != 64) { + if (NumElems == 4 && !isMMX) { // Don't do this for MMX. MVT::ValueType MaskVT = PermMask.getValueType(); MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7d70480a823..dbc04b01e20 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2803,13 +2803,7 @@ def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm), (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; } -// Splat v4f32 -def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm), - (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>, - Requires<[HasSSE1]>; - // Special unary SHUFPSrri case. -// FIXME: when we want non two-address code, then we should use PSHUFD? 
def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef), SHUFP_unary_shuffle_mask:$sm)), (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, @@ -2820,7 +2814,7 @@ def : Pat<(v2f64 (vector_shuffle VR128:$src1, (undef), (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE2]>; // Unary v4f32 shuffle with PSHUF* in order to fold a load. -def : Pat<(vector_shuffle (memopv4f32 addr:$src1), (undef), +def : Pat<(vector_shuffle (bc_v4i32 (memopv4f32 addr:$src1)), (undef), SHUFP_unary_shuffle_mask:$sm), (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE2]>; diff --git a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll b/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll index 18ef151ca71..142bcd33478 100644 --- a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll +++ b/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll @@ -1,6 +1,6 @@ ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep 170 -define i16 @f(<4 x float>* %tmp116117.i1061.i) { +define i16 @f(<4 x float>* %tmp116117.i1061.i) nounwind { entry: alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:0 [#uses=167] alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:1 [#uses=170] diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll index 737da66d1fa..e6c88bbff9d 100644 --- a/test/CodeGen/X86/peep-vector-extract-concat.ll +++ b/test/CodeGen/X86/peep-vector-extract-concat.ll @@ -1,6 +1,6 @@ -; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2,-sse41 | grep {shufps \$3, %xmm0, %xmm0} +; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2,-sse41 | grep {pshufd \$3, %xmm0, %xmm0} -define float @foo(<8 x float> %a) { +define float @foo(<8 x float> %a) nounwind { %c = extractelement <8 x float> %a, i32 3 ret float %c } diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index 31716bcd8c6..546ca0bcf30 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,15 +1,14 @@ ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f -; RUN: grep shufps %t | count 1 -; RUN: grep pshufd %t | count 1 +; RUN: grep pshufd %t | count 2 -define <4 x float> @test(float %a) { +define <4 x float> @test(float %a) nounwind { %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 ; <<4 x float>> [#uses=1] %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] ret <4 x float> %tmp6 } -define <2 x i64> @test2(i32 %a) { +define <2 x i64> @test2(i32 %a) nounwind { %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 ; <<4 x i32>> [#uses=1] %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 ; <<4 x i32>> [#uses=1] %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1] diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll index a28c54491f6..1eeedf184dd 100644 --- a/test/CodeGen/X86/vec_set-6.ll +++ b/test/CodeGen/X86/vec_set-6.ll @@ -2,7 +2,7 @@ ; RUN: grep unpcklps %t | count 1 ; RUN: grep shufps %t | count 1 -define <4 x float> @test(float %a, float %b, float %c) { +define <4 x float> @test(float %a, float %b, float %c) nounwind { %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 ; <<4 x float>> [#uses=1] %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2 ; <<4 x float>> [#uses=1] %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3 ; <<4 x float>> [#uses=1] diff --git a/test/CodeGen/X86/vec_shuffle-13.ll 
b/test/CodeGen/X86/vec_shuffle-13.ll index 6953bf0e4b1..f889e8f279e 100644 --- a/test/CodeGen/X86/vec_shuffle-13.ll +++ b/test/CodeGen/X86/vec_shuffle-13.ll @@ -5,17 +5,17 @@ ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuflw | count 1 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufhw | count 1 -define <8 x i16> @t1(<8 x i16> %A, <8 x i16> %B) { +define <8 x i16> @t1(<8 x i16> %A, <8 x i16> %B) nounwind { %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 > ret <8 x i16> %tmp } -define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) { +define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind { %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp } -define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) { +define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind { %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 > ret <8 x i16> %tmp } diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll new file mode 100644 index 00000000000..2e12f6e9bd4 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-16.ll @@ -0,0 +1,25 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse | grep shufps | count 4 +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse | grep mov | count 2 +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd | count 4 +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep shufps +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep mov + +define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind { + %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %tmp1 +} + +define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind { + %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 > + ret <4 x float> %tmp +} + +define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind { + %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 > + ret <4 x float> %tmp +} + +define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind { + %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 > + ret <4 x float> %tmp +} diff --git a/test/CodeGen/X86/vec_shuffle-5.ll b/test/CodeGen/X86/vec_shuffle-5.ll index 9343684946d..1acd73fcba3 100644 --- a/test/CodeGen/X86/vec_shuffle-5.ll +++ b/test/CodeGen/X86/vec_shuffle-5.ll @@ -2,7 +2,7 @@ ; RUN: grep movhlps %t | count 1 ; RUN: grep shufps %t | count 1 -define void @test() { +define void @test() nounwind { %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2] %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll index 34c039ac204..f39b9fe2dba 100644 --- a/test/CodeGen/X86/vec_shuffle.ll +++ b/test/CodeGen/X86/vec_shuffle.ll @@ -3,7 +3,7 @@ ; RUN: grep movupd %t | count 1 ; RUN: grep pshufhw %t | count 1 -define void @test_v4sf(<4 x float>* %P, float %X, float %Y) { +define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind { %tmp = 
insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1] %tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2 ; <<4 x float>> [#uses=1] @@ -12,14 +12,14 @@ define void @test_v4sf(<4 x float>* %P, float %X, float %Y) { ret void } -define void @test_v2sd(<2 x double>* %P, double %X, double %Y) { +define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind { %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1] %tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1 ; <<2 x double>> [#uses=1] store <2 x double> %tmp2, <2 x double>* %P ret void } -define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) { +define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind { %tmp = load <2 x i64>* %A ; <<2 x i64>> [#uses=1] %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> ; <<8 x i16>> [#uses=8] %tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 ; [#uses=1] diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll index 26e1b8839a6..c6e3dddd5fa 100644 --- a/test/CodeGen/X86/vec_splat-2.ll +++ b/test/CodeGen/X86/vec_splat-2.ll @@ -1,6 +1,6 @@ ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd | count 1 -define void @test(<2 x i64>* %P, i8 %x) { +define void @test(<2 x i64>* %P, i8 %x) nounwind { %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1] %tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1 ; <<16 x i8>> [#uses=1] %tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2 ; <<16 x i8>> [#uses=1] diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll index c6100ecd094..64222e40ff2 100644 --- a/test/CodeGen/X86/vec_splat.ll +++ b/test/CodeGen/X86/vec_splat.ll @@ -1,7 +1,7 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep shufps +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse3 | grep movddup -define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) { +define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1] %tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2 ; <<4 x float>> [#uses=1] @@ -12,7 +12,7 @@ define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) { ret void } -define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) { +define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1] %tmp2 = insertelement <2 x double> %tmp, double %X, i32 1 ; <<2 x double>> [#uses=1] %tmp4 = load <2 x double>* %Q ; <<2 x double>> [#uses=1] -- 2.34.1
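
Why pshufd wins for unary shuffles: shufps is a two-address instruction (its destination must be its first source), so a unary shuffle whose input is still live afterwards forces the register allocator to insert an extra movaps copy. pshufd takes an immediate and can write any destination register, so no copy is needed. The sketch below is a hypothetical illustration of the bitcast trick LowerVECTOR_SHUFFLE now performs for v4f32 (the function names are made up; the intrinsics map one-to-one onto the two instructions):

    #include <emmintrin.h>  // SSE2: pshufd and the float<->int casts

    // SSE1-style lowering: shufps reads the same register twice for a
    // unary shuffle and clobbers its first source.
    static __m128 splat3_shufps(__m128 a) {
      return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3));
    }

    // SSE2-style lowering, as this patch now emits: bitcast v4f32 ->
    // v4i32, pshufd into a fresh register, bitcast back.  The casts
    // cost no instructions.
    static __m128 splat3_pshufd(__m128 a) {
      __m128i ai = _mm_castps_si128(a);
      ai = _mm_shuffle_epi32(ai, _MM_SHUFFLE(3, 3, 3, 3));
      return _mm_castsi128_ps(ai);
    }

The new vec_shuffle-16.ll test pins down the payoff: with -mattr=+sse the four unary shuffles take four shufps plus two register moves, while with -mattr=+sse2 they become four pshufds and the moves disappear. (One caveat worth noting: on some microarchitectures, routing float data through the integer shuffle domain incurs a bypass delay, so the trade-off is not universal.)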
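The reworked PromoteSplat follows the same idea at a smaller granularity: a v16i8 or v8i16 splat is reduced to a v4i32 problem by repeatedly unpacking the low elements against themselves until only four 32-bit lanes matter, after which a single pshufd with the zero mask finishes the splat; a v4f32 splat skips the widening and goes straight through v4i32 when SSE2 is available. A minimal sketch of the byte case, with a hypothetical function name (the intrinsic chain mirrors the unpckl/pshufd sequence the lowering builds):

    #include <emmintrin.h>

    // Splat byte x into all 16 lanes, mirroring PromoteSplat's widening:
    // unpack bytes -> words -> dwords, then one pshufd of lane 0.
    static __m128i splat_byte(char x) {
      __m128i v = _mm_cvtsi32_si128((unsigned char)x);
      v = _mm_unpacklo_epi8(v, v);   // low byte duplicated into word 0
      v = _mm_unpacklo_epi16(v, v);  // word 0 duplicated into dword 0
      return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0));
    }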
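The X86InstrSSE.td change extends the preference to loads: the unary-shuffle-from-memory pattern now bitcasts the memopv4f32 operand to v4i32 and selects PSHUFDmi, fusing the load and the shuffle into a single pshufd with a memory operand. A shufps lowering cannot fold a unary shuffle's load, since the instruction would need the memory value in both source positions. A sketch of the shape of code that benefits, under the assumption that p is 16-byte aligned (legacy SSE memory operands must be aligned for the fold; the function name is made up):

    #include <emmintrin.h>

    // Reverse the four floats at p.  With the new PSHUFDmi pattern this
    // can compile to one pshufd reading directly from memory instead of
    // a movaps load followed by a shufps.
    static __m128 reverse_from_mem(const float *p) {
      __m128i v = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
      return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)));
    }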