From cc62abbe39fdb019fcc71bd4937bb84ced0cd0ca Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Sat, 20 Sep 2014 03:32:25 +0000
Subject: [PATCH] [x86] Generalize the single-element insertion lowering to
 work with floating point types and use it for both v2f64 and v2i64
 single-element insertion lowering.

This fixes the last non-AVX performance regression test case I've gotten
for the new vector shuffle lowering. There is obvious analogous lowering
for v4f32 that I'll add in a follow-up patch (because with INSERTPS,
v4f32 requires special treatment). After that, it's AVX stuff.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218175 91177308-0d34-0410-b5e6-96231b3b80d8
---
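Note on the mask inversion this patch introduces: for 2-wide shuffle masks,
lane indices 0-1 select from the first input and 2-3 from the second, so
XOR-ing an index with 2 retargets it to the other input, while -1 (undef)
is left alone. A minimal standalone sketch of just that arithmetic follows;
invertLane and the driver are illustrative names, not code from this patch:

    // Flip a 2-wide shuffle-mask lane between V1's lanes {0,1} and
    // V2's lanes {2,3}; -1 marks an undef lane and stays -1.
    #include <cstdio>

    static int invertLane(int M) { return M < 0 ? -1 : (M ^ 2); }

    int main() {
      // Mask {2, 1} reads lane 0 from V2 and lane 1 from V1.
      int Mask[2] = {2, 1};
      int Inv[2] = {invertLane(Mask[0]), invertLane(Mask[1])};
      printf("{%d, %d}\n", Inv[0], Inv[1]); // prints {0, 3}: the same
                                            // shuffle with inputs swapped
      return 0;
    }

This is what lets lowerVectorShuffleAsElementInsertion below recurse with
V1 and V2 exchanged when the V1-sourced lane is not zeroable but the
V2-sourced lane is.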
 lib/Target/X86/X86ISelLowering.cpp        | 58 ++++++++++++++++++-----
 test/CodeGen/X86/vector-shuffle-128-v2.ll | 38 +++++++++++++++
 2 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 94452b1e31a..aaac02f3f80 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7553,7 +7553,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
 ///
 /// This is a common pattern that we have especially efficient patterns to lower
 /// across all subtarget feature sets.
-static SDValue lowerIntegerElementInsertionVectorShuffle(
+static SDValue lowerVectorShuffleAsElementInsertion(
     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
@@ -7561,10 +7561,30 @@ static SDValue lowerIntegerElementInsertionVectorShuffle(
   int V2Index = std::find_if(Mask.begin(), Mask.end(),
                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
                 Mask.begin();
+  if (Mask.size() == 2) {
+    if (!Zeroable[V2Index ^ 1]) {
+      // For 2-wide masks we may be able to just invert the inputs. We use an
+      // xor with 2 to flip from {2,3} to {0,1} and vice versa.
+      int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+                            Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+      if (Zeroable[V2Index])
+        return lowerVectorShuffleAsElementInsertion(VT, DL, V2, V1, InverseMask,
+                                                    Subtarget, DAG);
+      else
+        return SDValue();
+    }
+  } else {
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      if (i != V2Index && !Zeroable[i])
+        return SDValue(); // Not inserting into a zero vector.
+  }

-  for (int i = 0, Size = Mask.size(); i < Size; ++i)
-    if (i != V2Index && !Zeroable[i])
-      return SDValue(); // Not inserting into a zero vector.
+  // Step over any bitcasts on either input so we can scan the actual
+  // BUILD_VECTOR nodes.
+  while (V1.getOpcode() == ISD::BITCAST)
+    V1 = V1.getOperand(0);
+  while (V2.getOpcode() == ISD::BITCAST)
+    V2 = V2.getOperand(0);

   // Check for a single input from a SCALAR_TO_VECTOR node.
   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
@@ -7579,10 +7599,9 @@ static SDValue lowerIntegerElementInsertionVectorShuffle(
   SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());

   // First, we need to zext the scalar if it is smaller than an i32.
-  MVT EltVT = VT.getVectorElementType();
-  assert(EltVT == V2S.getSimpleValueType() &&
-         "Different scalar and element types!");
   MVT ExtVT = VT;
+  MVT EltVT = VT.getVectorElementType();
+  V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
   if (EltVT == MVT::i8 || EltVT == MVT::i16) {
     // Zero-extend directly to i32.
     ExtVT = MVT::v4i32;
@@ -7650,6 +7669,12 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 1, 3))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);

+  // If we have a single input, insert that into V1 if we can do so cheaply.
+  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+
   if (Subtarget->hasSSE41())
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                   DAG))
@@ -7697,6 +7722,13 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 1, 3))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);

+  // If we have a single input from V2, insert that into V1 if we can do so
+  // cheaply.
+  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+
   if (Subtarget->hasSSE41())
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                   DAG))
@@ -7923,8 +7955,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,

   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
-            MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
       return V;

   if (Subtarget->hasSSE41())
@@ -8604,8 +8636,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,

   // There are special ways we can lower some single-element blends.
   if (NumV2Inputs == 1)
-    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
-            MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
       return V;

   if (Subtarget->hasSSE41())
@@ -8920,8 +8952,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,

   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
-            MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG))
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
       return V;

   // Check whether a compaction lowering can be done. This handles shuffles
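An aside on the (Mask[0] >= 2) + (Mask[1] >= 2) == 1 guard at both new call
sites above: each comparison yields 0 or 1, so the sum is the number of lanes
sourced from V2, and the lowering only fires when exactly one lane is. A small
sketch under that reading; hasSingleV2Input is an illustrative name, not part
of the patch:

    #include <cassert>

    // Count how many of a 2-wide shuffle's lanes come from the second
    // input; mask indices >= 2 refer to V2 by LLVM's convention.
    static bool hasSingleV2Input(const int Mask[2]) {
      return (Mask[0] >= 2) + (Mask[1] >= 2) == 1;
    }

    int main() {
      const int A[2] = {0, 3}; // V1[0] then V2[1]: exactly one V2 lane
      const int B[2] = {2, 3}; // both lanes from V2: a plain copy of V2
      assert(hasSingleV2Input(A) && !hasSingleV2Input(B));
      return 0;
    }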
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 06673936586..cc6fa2e7a0f 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -400,6 +400,44 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 }

+define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
+; ALL-LABEL: @insert_reg_and_zero_v2i64
+; ALL: movd %rdi, %xmm0
+; ALL-NEXT: retq
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v2i64
+; ALL: movq (%rdi), %xmm0
+; ALL-NEXT: retq
+  %a = load i64* %ptr
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
+; ALL-LABEL: @insert_reg_and_zero_v2f64
+; ALL: movq %xmm0, %xmm0
+; ALL-NEXT: retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v2f64
+; ALL: movsd (%rdi), %xmm0
+; ALL-NEXT: retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
 define <2 x double> @insert_dup_reg_v2f64(double %a) {
 ; SSE2-LABEL: @insert_dup_reg_v2f64
 ; SSE2: movlhps {{.*}} # xmm0 = xmm0[0,0]
-- 
2.34.1
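A closing note on why the new tests can expect a single move instruction: in
each one the second shuffle operand is zeroinitializer, so every lane other
than the inserted element is provably zero and the element-insertion path
applies. Below is a toy model of that zeroable-lane computation, deliberately
reduced to the whole-vector-zero case these tests exercise; the real
computeZeroableShuffleElements in X86ISelLowering.cpp works on SDValues and
also inspects BUILD_VECTOR operands, so treat this only as a sketch:

    #include <cstdio>
    #include <vector>

    // A lane is "zeroable" if it is undef (mask index < 0) or reads
    // from an input vector known to be all zeros.
    static std::vector<bool> computeZeroable(const std::vector<int> &Mask,
                                             bool V1Zero, bool V2Zero) {
      int Size = (int)Mask.size();
      std::vector<bool> Zeroable(Size);
      for (int i = 0; i < Size; ++i)
        Zeroable[i] = Mask[i] < 0 || (Mask[i] < Size ? V1Zero : V2Zero);
      return Zeroable;
    }

    int main() {
      // Mask <0, 3> from the insert_*_and_zero tests: V1[0], then V2[1]
      // with V2 = zeroinitializer.
      auto Z = computeZeroable({0, 3}, /*V1Zero=*/false, /*V2Zero=*/true);
      printf("lane 0: %d, lane 1: %d\n", (int)Z[0], (int)Z[1]); // 0, 1
      return 0;
    }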